/*
* Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fhcrc.cpl.viewer.align.commandline;
import org.fhcrc.cpl.viewer.commandline.modules.BaseViewerCommandLineModuleImpl;
import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet;
import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.proteomics.feature.matching.FeatureSetMatcher;
import org.fhcrc.cpl.viewer.amt.*;
import org.fhcrc.cpl.viewer.align.Aligner;
import org.fhcrc.cpl.viewer.align.SplineAligner;
import org.fhcrc.cpl.viewer.align.QuantileRegressionAligner;
import org.fhcrc.cpl.viewer.align.PeptideArrayAnalyzer;
import org.fhcrc.cpl.toolbox.commandline.CommandLineModuleExecutionException;
import org.fhcrc.cpl.toolbox.commandline.CommandLineModule;
import org.fhcrc.cpl.toolbox.commandline.arguments.*;
import org.fhcrc.cpl.toolbox.ApplicationContext;
import org.fhcrc.cpl.toolbox.datastructure.Pair;
import org.fhcrc.cpl.toolbox.statistics.RegressionUtilities;
import org.fhcrc.cpl.toolbox.statistics.BasicStatistics;
import org.fhcrc.cpl.toolbox.statistics.MatrixUtil;
import org.fhcrc.cpl.toolbox.gui.chart.PanelWithScatterPlot;
import org.fhcrc.cpl.toolbox.gui.chart.PanelWithHistogram;
import org.apache.log4j.Logger;
import org.jfree.chart.plot.XYPlot;
import java.util.*;
import java.util.List;
import java.io.File;
import java.io.IOException;
import java.awt.*;
/**
* Command linemodule for plotting the mass calibration of a feature file
*/
public class CollapseFractionsCLM extends BaseViewerCommandLineModuleImpl
implements CommandLineModule
{
protected static Logger _log = Logger.getLogger(CollapseFractionsCLM.class);
protected boolean showCharts = true;
protected PeptideArrayAnalyzer arrayAnalyzer;
protected boolean normalizeIntensities = true;
protected File outFile;
protected int minFeaturesInFracAndUnfrac = 15;
protected int minFeaturesForRegress = 15;
public CollapseFractionsCLM()
{
init();
}
protected void init()
{
mCommandName = "collapsefractions";
mShortDescription = "Collapse multiple fractions into a single featureset";
mHelpMessage = mShortDescription;
CommandLineArgumentDefinition[] argDefs =
{
createUnnamedFileArgumentDefinition(true,
"peptide array in which first run is unfractionated, all the rest fractionated"),
new BooleanArgumentDefinition("showcharts", false, "show charts?", showCharts),
new BooleanArgumentDefinition("normalize", false, "Normalize fraction intensities?", normalizeIntensities),
new FileToWriteArgumentDefinition("out", true, "output file"),
};
addArgumentDefinitions(argDefs);
}
public void assignArgumentValues()
throws ArgumentValidationException
{
showCharts = getBooleanArgumentValue("showcharts");
normalizeIntensities = getBooleanArgumentValue("normalize");
try
{
arrayAnalyzer = new PeptideArrayAnalyzer(getUnnamedFileArgumentValue());
}
catch (IOException e)
{
throw new ArgumentValidationException("Failed to load peptide array",e);
}
outFile = getFileArgumentValue("out");
}
/**
* do the actual work
*/
public void execute() throws CommandLineModuleExecutionException
{
List<Double[]> intensitiesAllRuns = new ArrayList<Double[]>();
for (int i=0; i<arrayAnalyzer.getRunCount(); i++)
intensitiesAllRuns.add(arrayAnalyzer.getRunIntensities(arrayAnalyzer.getRunNames().get(i)));
Double[] firstRunIntensities = intensitiesAllRuns.get(0);
if (normalizeIntensities)
normalizeIntensities(intensitiesAllRuns);
int nrow = arrayAnalyzer.getRowMaps().length;
Double[] featureFracIntensitySums = new Double[nrow];
//Do not add a sum value if there's a missing value between two non-missing values
int numFoundMissingBracketedByPresent = 0;
List<Integer> fractionCounts = new ArrayList<Integer>();
for (int i=0; i<nrow; i++)
{
double rowFracSum = 0;
boolean foundNonMissingFrac = false;
boolean foundMissingAfterNonMissing = false;
boolean foundNonMissingAfterMissingAfterNonMissing = false;
int fractionCount = 0;
for (int j=1; j<arrayAnalyzer.getRunCount(); j++)
{
if (intensitiesAllRuns.get(j)[i] != null)
{
fractionCount++;
foundNonMissingFrac = true;
if (foundMissingAfterNonMissing)
foundNonMissingAfterMissingAfterNonMissing = true;
rowFracSum += intensitiesAllRuns.get(j)[i];
// else ApplicationContext.infoMessage("Present, missing, present: " + intensitiesAllRuns.get(1)[i] + ", " + intensitiesAllRuns.get(2)[i] + ", " + intensitiesAllRuns.get(3)[i] + ", " + intensitiesAllRuns.get(4)[i]);
}
else
if (foundNonMissingFrac)
foundMissingAfterNonMissing = true;
}
if (foundNonMissingAfterMissingAfterNonMissing)
numFoundMissingBracketedByPresent++;
if (rowFracSum > 0 && !foundNonMissingAfterMissingAfterNonMissing)
{
featureFracIntensitySums[i] = rowFracSum;
fractionCounts.add(fractionCount);
}
else fractionCounts.add(0);
}
ApplicationContext.infoMessage("Number discarded because found missing in between present: " + numFoundMissingBracketedByPresent);
Map<Integer, List<Double>> unfracFractionInCommonLogIntensitiesMap = new HashMap<Integer, List<Double>>();
Map<Integer, List<Double>> fracFractionInCommonLogIntensitiesMap = new HashMap<Integer, List<Double>>();
for (int i=0; i<nrow; i++)
{
if (featureFracIntensitySums[i] != null && firstRunIntensities[i] != null)
{
int numFractionsUsed = fractionCounts.get(i);
List<Double> unfracLogIntensitiesThisFractionCount =
unfracFractionInCommonLogIntensitiesMap.get(numFractionsUsed);
List<Double> fracLogIntensitiesThisFractionCount =
fracFractionInCommonLogIntensitiesMap.get(numFractionsUsed);
if (unfracLogIntensitiesThisFractionCount == null)
{
unfracLogIntensitiesThisFractionCount = new ArrayList<Double>();
unfracFractionInCommonLogIntensitiesMap.put(numFractionsUsed,
unfracLogIntensitiesThisFractionCount);
fracLogIntensitiesThisFractionCount = new ArrayList<Double>();
fracFractionInCommonLogIntensitiesMap.put(numFractionsUsed,
fracLogIntensitiesThisFractionCount);
}
unfracLogIntensitiesThisFractionCount.add(Math.log(firstRunIntensities[i]));
fracLogIntensitiesThisFractionCount.add(Math.log(featureFracIntensitySums[i]));
}
}
//todo: technically this should only need to be done for runCount > 1
for (int i=1; i<=arrayAnalyzer.getRunCount(); i++)
{
if (unfracFractionInCommonLogIntensitiesMap.containsKey(i))
{
System.err.println("Found " + unfracFractionInCommonLogIntensitiesMap.get(i).size() +
" points in " + i + " fractions");
if (unfracFractionInCommonLogIntensitiesMap.get(i).size() < minFeaturesForRegress)
{
ApplicationContext.infoMessage("\tToo few for regression, so removing instead of adjusting");
int numRemoved = 0;
for (int j=0; j<featureFracIntensitySums.length; j++)
{
if (fractionCounts.get(j) == i && featureFracIntensitySums[j] != null)
{
featureFracIntensitySums[j] = null;
numRemoved++;
}
}
ApplicationContext.infoMessage("\tRemoved " + numRemoved + " intensities found in " + i + " fractions");
continue;
}
//normalize intensities per fraction
double[] coeffs = calcGoodRegressionCoeffs(fracFractionInCommonLogIntensitiesMap.get(i),
unfracFractionInCommonLogIntensitiesMap.get(i)) ;
if (normalizeIntensities)
{
for (int j=0; j<featureFracIntensitySums.length; j++)
{
if (fractionCounts.get(j) == i && featureFracIntensitySums[j] != null)
{
featureFracIntensitySums[j] =
RegressionUtilities.mapValueUsingCoefficients(coeffs, featureFracIntensitySums[j]);
}
}
}
if (showCharts)
{
PanelWithScatterPlot pwsp = new PanelWithScatterPlot(fracFractionInCommonLogIntensitiesMap.get(i),
unfracFractionInCommonLogIntensitiesMap.get(i), "in"+i+"fracs");
pwsp.setAxisLabels("Fractionated","Unfractionated");
pwsp.addLineOrCurve(coeffs);
pwsp.addLineOrCurve(new double[]{0,1});
pwsp.displayInTab();
}
if (normalizeIntensities)
{
for (int j=0; j<fracFractionInCommonLogIntensitiesMap.get(i).size(); j++)
fracFractionInCommonLogIntensitiesMap.get(i).set(j,
RegressionUtilities.mapValueUsingCoefficients(coeffs,
fracFractionInCommonLogIntensitiesMap.get(i).get(j)));
}
else
ApplicationContext.infoMessage("Note: Skipping intensity normalization for features in " + i + " fractions");
if (showCharts)
{
PanelWithScatterPlot pwsp = new PanelWithScatterPlot(fracFractionInCommonLogIntensitiesMap.get(i),
unfracFractionInCommonLogIntensitiesMap.get(i), "FracUnfrac"+i);
pwsp.addLineOrCurve(new double[] {0,1});
pwsp.displayInTab();
ApplicationContext.infoMessage("Log-intensity correlation between fractionated and unfrac for peptides in " + i + " fractions: " +
BasicStatistics.correlationCoefficient(unfracFractionInCommonLogIntensitiesMap.get(i),
fracFractionInCommonLogIntensitiesMap.get(i)));
}
}
}
//Now we write out a feature file with features with appropriate intensities
Map<Integer, Map<String, List<Feature>>> detailsMap = null;
try
{
ApplicationContext.infoMessage("Loading details file...");
detailsMap = arrayAnalyzer.loadDetailsRowMapsById();
ApplicationContext.infoMessage("Loaded " + detailsMap.size() + " array rows' details from details file");
}
catch (IOException e)
{
throw new CommandLineModuleExecutionException("Failed to process details file",e);
}
List<Feature> outFeatures = new ArrayList<Feature>();
for (int i=0; i<nrow; i++)
{
int id = i+1;
if (featureFracIntensitySums[i] == null)
continue;
double newIntensity = featureFracIntensitySums[i];
String firstAvailableRun = detailsMap.get(id).keySet().iterator().next();
Feature firstDetailFeature = detailsMap.get(id).get(firstAvailableRun).get(0);
double intensityScaleFactor = newIntensity / firstDetailFeature.getIntensity();
firstDetailFeature.setIntensity((float) newIntensity);
firstDetailFeature.setTotalIntensity((float) intensityScaleFactor * firstDetailFeature.getTotalIntensity());
outFeatures.add(firstDetailFeature);
}
ApplicationContext.infoMessage("Created " + outFeatures.size() + " features to save");
FeatureSet outFeatureSet = new FeatureSet();
outFeatureSet.setFeatures(outFeatures.toArray(new Feature[outFeatures.size()]));
try
{
outFeatureSet.save(outFile);
}
catch (IOException e)
{
throw new CommandLineModuleExecutionException("Failed to save output file",e);
}
if (showCharts)
{
new PanelWithHistogram(fractionCounts, "FractionCount").displayInTab();
List<Double> unfracInCommonLogIntensities = new ArrayList<Double>();
List<Double> fracInCommonLogIntensities = new ArrayList<Double>();
for (int i=0; i<nrow; i++)
{
if (featureFracIntensitySums[i] != null && firstRunIntensities[i] != null)
{
unfracInCommonLogIntensities.add(Math.log(firstRunIntensities[i]));
fracInCommonLogIntensities.add(Math.log(featureFracIntensitySums[i]));
}
}
PanelWithScatterPlot fracUnfracScatter =
new PanelWithScatterPlot(unfracInCommonLogIntensities, fracInCommonLogIntensities, "Frac vs unfrac");
fracUnfracScatter.setAxisLabels("Unfractionated","Fractionated");
fracUnfracScatter.addLineOrCurve(new double[] {0,1});
fracUnfracScatter.displayInTab();
for (int i=2; i<=arrayAnalyzer.getRunCount(); i++)
if (unfracFractionInCommonLogIntensitiesMap.containsKey(i))
{
}
// fracUnfracScatterByFracCount.setAxisLabels("Unfractionated","Fractionated");
// fracUnfracScatterByFracCount.setPointSize(2);
// fracUnfracScatterByFracCount.setShowLegend(true);
// fracUnfracScatterByFracCount.setSeriesColors(new Color[]{Color.BLUE, Color.RED, Color.GREEN, Color.ORANGE});
// fracUnfracScatterByFracCount.setSeriesColor(0, Color.BLUE);
// fracUnfracScatterByFracCount.setSeriesColor(1, Color.RED);
// fracUnfracScatterByFracCount.setSeriesColor(2, Color.GREEN);
// fracUnfracScatterByFracCount.setSeriesColor(3, Color.ORANGE);
// fracUnfracScatterByFracCount.addLineOrCurve(new double[] {0,1});
//
// fracUnfracScatterByFracCount.displayInTab();
ApplicationContext.infoMessage("Log-intensity correlation between fractionated and unfrac: " +
BasicStatistics.correlationCoefficient(unfracInCommonLogIntensities, fracInCommonLogIntensities));
ApplicationContext.infoMessage("In-common median intensity, frac: " +
BasicStatistics.median(fracInCommonLogIntensities) + ", no frac: " +
BasicStatistics.median(unfracInCommonLogIntensities));
List<Double> unfracNotNullLogIntensities = new ArrayList<Double>();
for (Double intensity : firstRunIntensities)
if (intensity != null) unfracNotNullLogIntensities.add(Math.log(intensity));
new PanelWithHistogram(unfracNotNullLogIntensities, "Unfractionated").displayInTab();
ApplicationContext.infoMessage("Unfractionated intensities: n=" + unfracNotNullLogIntensities.size() +
", median=" + BasicStatistics.median(unfracNotNullLogIntensities));
List<Double> fracNotNullLogIntensities = new ArrayList<Double>();
for (Double intensity : featureFracIntensitySums)
if (intensity != null)
{
fracNotNullLogIntensities.add(Math.log(intensity));
}
new PanelWithHistogram(fracNotNullLogIntensities, "Fractionated").displayInTab();
ApplicationContext.infoMessage("Fractionated intensities: n=" + fracNotNullLogIntensities.size() +
", median=" + BasicStatistics.median(fracNotNullLogIntensities));
}
}
protected void normalizeIntensities(List<Double[]> intensitiesAllRuns)
throws CommandLineModuleExecutionException
{
Double[] firstRunIntensities = intensitiesAllRuns.get(0);
PanelWithScatterPlot pwspAllRunsNorm = null;
if (showCharts)
{
pwspAllRunsNorm = new PanelWithScatterPlot();
pwspAllRunsNorm.setAxisLabels("Original","Mapped");
pwspAllRunsNorm.setName("RunNormalization");
}
ApplicationContext.infoMessage("Rows in array: " + firstRunIntensities.length);
for (int i=1; i<arrayAnalyzer.getRunCount(); i++)
{
ApplicationContext.infoMessage("Normalizing fraction " + i);
Double[] intensitiesThisRun = intensitiesAllRuns.get(i);
List<Double> firstRunIntensitiesJustThisRun = new ArrayList<Double>();
List<Double> thisRunIntensitiesJustThisRun = new ArrayList<Double>();
for (int j=0; j<firstRunIntensities.length; j++)
{
if (intensitiesThisRun[j] == null || firstRunIntensities[j] == null)
continue;
boolean foundInAnotherRun = false;
for (int k=1; k<arrayAnalyzer.getRunCount(); k++)
{
if (k!=i && intensitiesAllRuns.get(k)[j] != null)
{
foundInAnotherRun = true;
break;
}
}
if (!foundInAnotherRun)
{
firstRunIntensitiesJustThisRun.add(firstRunIntensities[j]);
thisRunIntensitiesJustThisRun.add(intensitiesThisRun[j]);
}
}
ApplicationContext.infoMessage("values in common between ONLY unfrac and fraction " + i + ":" +
firstRunIntensitiesJustThisRun.size());
List<Double> logFirst = new ArrayList<Double>();
List<Double> logThis = new ArrayList<Double>();
double minx = Double.MAX_VALUE;
double maxx = Double.MIN_VALUE;
for (int l=0; l<firstRunIntensitiesJustThisRun.size(); l++)
{
logFirst.add(Math.log(firstRunIntensitiesJustThisRun.get(l)));
logThis.add(Math.log(thisRunIntensitiesJustThisRun.get(l)));
minx = Math.min(Math.log(thisRunIntensitiesJustThisRun.get(l)), minx);
maxx = Math.max(Math.log(thisRunIntensitiesJustThisRun.get(l)), maxx);
}
if (logFirst.size() < minFeaturesInFracAndUnfrac)
{
ApplicationContext.infoMessage("WARNING: unable to normalize run " + i + ", not enough in common with unfractionated run");
continue;
}
double[] coeffs = null;
// try
// {
// //todo: un-hardcode
//// coeffs = RegressionUtilities.modalRegression(logFirst, logThis, 1, 6, 2.0);
// }
// catch (IOException e)
// {
// throw new CommandLineModuleExecutionException("Failure in modal regression",e);
// }
//todo: un-hardcode
Pair<double[], double[]> lowLeverageLowResPoints =
RegressionUtilities.selectValuesWithLowLeverageAndStudentizedResidual(logThis, logFirst,
17, 1.8, false, 1, false, true);
//try
//{
// coeffs = RegressionUtilities.modalRegression(
// lowLeverageLowResPoints.first, lowLeverageLowResPoints.second);
//}
//catch (IOException e)
//{throw new CommandLineModuleExecutionException(e);}
coeffs = calcGoodRegressionCoeffs(logThis, logFirst);
if (showCharts)
{
PanelWithScatterPlot pwsp = new PanelWithScatterPlot(lowLeverageLowResPoints.first,
lowLeverageLowResPoints.second, "logintensities" + i);
pwsp.addData(logThis, logFirst, "used datapoints");
pwsp.setAxisLabels("current fraction","unfractionated run");
pwsp.addLineOrCurve(coeffs, minx, maxx);
pwsp.displayInTab();
}
List<Double> origNotNullIntensities = null;
if (showCharts)
{
origNotNullIntensities = new ArrayList<Double>();
for (int l=0; l<intensitiesThisRun.length; l++)
{
if (intensitiesThisRun[l] != null)
{
origNotNullIntensities.add(Math.log(intensitiesThisRun[l]));
}
}
}
for (int l=0; l<intensitiesThisRun.length; l++)
{
if (intensitiesThisRun[l] != null)
intensitiesThisRun[l] = Math.exp(RegressionUtilities.mapValueUsingCoefficients(coeffs,
Math.log(intensitiesThisRun[l])));
}
if (showCharts)
{
List<Double> mappedNotNullIntensities = new ArrayList<Double>();
for (int l=0; l<intensitiesThisRun.length; l++)
{
if (intensitiesThisRun[l] != null)
{
mappedNotNullIntensities.add(Math.log(intensitiesThisRun[l]));
}
}
pwspAllRunsNorm.addData(origNotNullIntensities,
mappedNotNullIntensities, "mappedintensities" + i);
}
}
if (showCharts)
pwspAllRunsNorm.displayInTab();
}
protected double[] calcGoodRegressionCoeffs(List<Double> xvalsList, List<Double> yvalsList)
{
double[] xvals = new double[xvalsList.size()];
double[] yvals = new double[xvalsList.size()];
for (int i=0; i<xvals.length; i++)
{
xvals[i] = xvalsList.get(i);
yvals[i] = yvalsList.get(i);
}
return calcGoodRegressionCoeffs(xvals, yvals);
}
protected double[] calcGoodRegressionCoeffs(double[] xvals, double[] yvals)
{
Pair<double[], double[]> lowLeverageLowResPoints =
RegressionUtilities.selectValuesWithLowLeverageAndStudentizedResidual(xvals, yvals,
17, 1.8, false, 1, false, true);
//try
//{
// coeffs = RegressionUtilities.modalRegression(
// lowLeverageLowResPoints.first, lowLeverageLowResPoints.second);
//}
//catch (IOException e)
//{throw new CommandLineModuleExecutionException(e);}
return RegressionUtilities.symmetricalRegression(
lowLeverageLowResPoints.first, lowLeverageLowResPoints.second);
}
}