/*
* Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fhcrc.cpl.viewer.align.commandline;
import org.fhcrc.cpl.viewer.commandline.modules.BaseViewerCommandLineModuleImpl;
import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet;
import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.proteomics.feature.AnalyzeICAT;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.IsotopicLabelExtraInfoDef;
import org.fhcrc.cpl.toolbox.proteomics.Peptide;
import org.fhcrc.cpl.toolbox.proteomics.Protein;
import org.fhcrc.cpl.viewer.align.PeptideArrayAnalyzer;
import org.fhcrc.cpl.toolbox.commandline.CommandLineModuleExecutionException;
import org.fhcrc.cpl.toolbox.commandline.CommandLineModule;
import org.fhcrc.cpl.toolbox.commandline.arguments.*;
import org.fhcrc.cpl.toolbox.ApplicationContext;
import org.fhcrc.cpl.toolbox.gui.chart.PanelWithHistogram;
import org.fhcrc.cpl.toolbox.statistics.BasicStatistics;
import org.apache.log4j.Logger;
import java.util.*;
import java.io.File;
import java.io.IOException;
public class CaseControlRatioFeatureSetCLM extends BaseViewerCommandLineModuleImpl
implements CommandLineModule
{
protected static Logger _log = Logger.getLogger(CaseControlRatioFeatureSetCLM.class);
protected File file;
protected File outFile;
protected int minRunsPerGroup = 1;
Object[] arrayRows;
PeptideArrayAnalyzer peptideArrayAnalyzer = null;
protected float minGreaterIntensityForRatio = 0f;
protected File fastaFile;
protected boolean shouldAddRatiosForMissing = false;
protected int minPresentToAddMissingRatio = 2;
protected boolean winsorize = false;
//90% Winsorization means locking values to 5th percentile and 95th percentile
protected float winsorizePercent = 96;
public CaseControlRatioFeatureSetCLM()
{
init();
}
protected void init()
{
mCommandName = "casecontrolratiofeatureset";
mShortDescription = "Create a featureset with one feature per peptide array row with a peptide assignment " +
"and ratios for those rows with both case and control features";
mHelpMessage = mShortDescription;
CommandLineArgumentDefinition[] argDefs =
{
createUnnamedFileArgumentDefinition(true, "peptide array"),
new FileToReadArgumentDefinition("caserunlistfile",false,
"File containing the names of runs in the case group, one per line"),
new FileToReadArgumentDefinition("controlrunlistfile",false,
"File containing the names of runs in the control group, one per line"),
new IntegerArgumentDefinition("minrunspergroup",false,
"Minimum number of runs required for a feature to be included in the feature set",
minRunsPerGroup),
new FileToWriteArgumentDefinition("out", true, "Output pepXML file"),
new DecimalArgumentDefinition("mingreaterintensity", false,
"Minimum intensity for the /greater/ of the two mean intensity values, case or control",
minGreaterIntensityForRatio),
new FileToReadArgumentDefinition("fasta", false, "FASTA database to pretend the file came from"),
new BooleanArgumentDefinition("addmissing", false, "Add extreme ratios for peptides with at " +
"least mintoaddmissing values in one group and none in the other?", shouldAddRatiosForMissing),
new IntegerArgumentDefinition("mintoaddmissing", false, "if addmissing specified, minimum " +
"number of intensity values in the present group in order to add an extreme ratio",
minPresentToAddMissingRatio),
new BooleanArgumentDefinition("winsorize", false, "Winsorize missing ratios at 95% (default: " +
"use 0 and 999)", winsorize)
};
addArgumentDefinitions(argDefs);
}
public void assignArgumentValues()
throws ArgumentValidationException
{
file = getFileArgumentValue(CommandLineArgumentDefinition.UNNAMED_PARAMETER_VALUE_ARGUMENT);
fastaFile = getFileArgumentValue("fasta");
outFile = getFileArgumentValue("out");
minRunsPerGroup = getIntegerArgumentValue("minrunspergroup");
minGreaterIntensityForRatio = getFloatArgumentValue("mingreaterintensity");
minPresentToAddMissingRatio = getIntegerArgumentValue("mintoaddmissing");
shouldAddRatiosForMissing = getBooleanArgumentValue("addmissing");
winsorize = getBooleanArgumentValue("winsorize");
if (!shouldAddRatiosForMissing) assertArgumentAbsent("winsorize", "addmissing");
try
{
peptideArrayAnalyzer = new PeptideArrayAnalyzer(file);
}
catch (Exception e)
{
throw new ArgumentValidationException(e);
}
File caseRunListFile = getFileArgumentValue("caserunlistfile");
File controlRunListFile = getFileArgumentValue("controlrunlistfile");
if (peptideArrayAnalyzer.getRunNames().size() > 2)
{
assertArgumentPresent("caserunlistfile");
assertArgumentPresent("controlrunlistfile");
}
else
{
if (caseRunListFile == null)
{
ApplicationContext.infoMessage("Only two runs this file. First will be 'case', second 'control'");
peptideArrayAnalyzer.setCaseRunNames(new String[] { peptideArrayAnalyzer.getRunNames().get(0) });
peptideArrayAnalyzer.setControlRunNames(new String[] { peptideArrayAnalyzer.getRunNames().get(1) });
}
}
if (caseRunListFile != null)
{
try
{
peptideArrayAnalyzer.loadCaseControlRunListsFromFiles(caseRunListFile, controlRunListFile);
}
catch (IOException e)
{
throw new ArgumentValidationException("Failed to load case or control run list file", e);
}
ApplicationContext.infoMessage("Case runs:");
for (String caseRun : peptideArrayAnalyzer.getCaseRunNames())
ApplicationContext.setMessage("\t" + caseRun);
ApplicationContext.infoMessage("Control runs:");
for (String controlRun : peptideArrayAnalyzer.getControlRunNames())
ApplicationContext.setMessage("\t" + controlRun);
}
else
{
peptideArrayAnalyzer.setCaseRunNames(new String[] { peptideArrayAnalyzer.getRunNames().get(0)});
peptideArrayAnalyzer.setControlRunNames(new String[] { peptideArrayAnalyzer.getRunNames().get(1)});
}
}
/**
* do the actual work
*/
public void execute() throws CommandLineModuleExecutionException
{
List<Feature> features = new ArrayList<Feature>();
//todo: un-hardcode
AnalyzeICAT.IsotopicLabel label = new AnalyzeICAT.IsotopicLabel("Acrylamide (D0/D3)",71.03657F,3.0188F,'C',
AnalyzeICAT.DEFAULT_MAX_LABEL_COUNT);
int numFeaturesWithRatios = 0;
int numFeaturesRatiosInsufficientIntensity = 0;
int numFeaturesWithPeptides = 0;
Set<String> allPeptides = new HashSet<String>();
List<Feature> extremeRatioFeatures = new ArrayList<Feature>();
List<Float> nonMissingRatios = new ArrayList<Float>();
for (Map<String, Object> rowMap : peptideArrayAnalyzer.getRowMaps())
{
Set<String> rowPeptides = peptideArrayAnalyzer.getAllRowPeptides(rowMap);
if (rowPeptides.size() != 1)
continue;
List<Double> caseIntensities = new ArrayList<Double>();
for (String caseRun : peptideArrayAnalyzer.getCaseRunNames())
{
Double intensity = peptideArrayAnalyzer.getRunIntensity(rowMap, caseRun);
if (intensity != null)
caseIntensities.add(intensity);
}
List<Double> controlIntensities = new ArrayList<Double>();
for (String run : peptideArrayAnalyzer.getControlRunNames())
{
Double intensity = peptideArrayAnalyzer.getRunIntensity(rowMap, run);
if (intensity != null)
controlIntensities.add(intensity);
}
if (caseIntensities.size() == 0 && controlIntensities.size() == 0)
continue;
numFeaturesWithPeptides++;
Feature rowFeature = new Feature();
String peptide = rowPeptides.iterator().next();
allPeptides.add(peptide);
rowFeature.setCharge(Integer.parseInt(rowMap.get("charge").toString()));
if (rowMap.containsKey("minScan"))
rowFeature.setScan((int) Double.parseDouble(rowMap.get("minScan").toString()));
else
{
rowFeature.setScan((int) Double.parseDouble(rowMap.get("minTime").toString()));
}
MS2ExtraInfoDef.setPeptideList(rowFeature, peptide);
MS2ExtraInfoDef.setPeptideProphet(rowFeature, 0.95);
Protein dummyProtein = new Protein("asdf", peptide.getBytes());
rowFeature.setMass((float) new Peptide(dummyProtein, 0, peptide.length()).getMonoisotopicMass());
rowFeature.updateMz();
//if (peptide.equals("FLGVAEQLHNEGFK")) System.err.println("FLGVAEQLHNEGFK, " + caseIntensities.size() +", " + controlIntensities.size());
if (caseIntensities.size() >= minRunsPerGroup && controlIntensities.size() >= minRunsPerGroup)
{
double geoMeanCase = BasicStatistics.geometricMean(caseIntensities);
double geoMeanControl = BasicStatistics.geometricMean(controlIntensities);
//only add a ratio if at least one of the intensities being compared is above threshold
if (Math.max(geoMeanCase, geoMeanControl) > minGreaterIntensityForRatio)
{
double ratio = geoMeanCase / geoMeanControl;
IsotopicLabelExtraInfoDef.setRatio(rowFeature, ratio);
IsotopicLabelExtraInfoDef.setLabel(rowFeature, label);
IsotopicLabelExtraInfoDef.setHeavyFirstScan(rowFeature, rowFeature.getScan());
IsotopicLabelExtraInfoDef.setHeavyLastScan(rowFeature, rowFeature.getScan());
IsotopicLabelExtraInfoDef.setLightFirstScan(rowFeature, rowFeature.getScan());
IsotopicLabelExtraInfoDef.setLightLastScan(rowFeature, rowFeature.getScan());
IsotopicLabelExtraInfoDef.setLightIntensity(rowFeature, geoMeanCase);
IsotopicLabelExtraInfoDef.setHeavyIntensity(rowFeature, geoMeanControl);
nonMissingRatios.add((float) ratio);
numFeaturesWithRatios++;
//if (peptide.equals("FLGVAEQLHNEGFK")) System.err.println("\tratio: " + ratio);
}
else
numFeaturesRatiosInsufficientIntensity++;
}
else
{
//check if we should add a fake ratio for missing data in one group or the other.
//if case missing, use 0. If control missing, use 999
if (shouldAddRatiosForMissing)
{
if ((caseIntensities.size() >= minPresentToAddMissingRatio && controlIntensities.size() == 0) ||
(controlIntensities.size() >= minPresentToAddMissingRatio && caseIntensities.size() == 0))
{
IsotopicLabelExtraInfoDef.setLabel(rowFeature, label);
IsotopicLabelExtraInfoDef.setHeavyFirstScan(rowFeature, rowFeature.getScan());
IsotopicLabelExtraInfoDef.setHeavyLastScan(rowFeature, rowFeature.getScan());
IsotopicLabelExtraInfoDef.setLightFirstScan(rowFeature, rowFeature.getScan());
IsotopicLabelExtraInfoDef.setLightLastScan(rowFeature, rowFeature.getScan());
numFeaturesWithRatios++;
if (caseIntensities.size() == 0)
{
IsotopicLabelExtraInfoDef.setRatio(rowFeature, 0);
IsotopicLabelExtraInfoDef.setLightIntensity(rowFeature, 0);
IsotopicLabelExtraInfoDef.setHeavyIntensity(rowFeature, BasicStatistics.geometricMean(controlIntensities));
}
else
{
IsotopicLabelExtraInfoDef.setRatio(rowFeature, 999);
IsotopicLabelExtraInfoDef.setLightIntensity(rowFeature, BasicStatistics.geometricMean(caseIntensities));
IsotopicLabelExtraInfoDef.setHeavyIntensity(rowFeature, 0);
}
}
extremeRatioFeatures.add(rowFeature);
}
}
features.add(rowFeature);
}
if (winsorize)
{
float winsorMinRatio = (float) BasicStatistics.percentile(nonMissingRatios, (100-winsorizePercent)/2);
float winsorMaxRatio = (float) BasicStatistics.percentile(nonMissingRatios, winsorizePercent + (100-winsorizePercent)/2);
ApplicationContext.infoMessage("Winsorizing: min=" + winsorMinRatio + ", max=" + winsorMaxRatio);
List<Float> logRatiosBeforeWinsor = new ArrayList<Float>();
for (float ratio : nonMissingRatios)
logRatiosBeforeWinsor.add((float) Math.log(ratio));
new PanelWithHistogram(logRatiosBeforeWinsor, "LogRatio preWinsor", 100).displayInTab();
List<Float> logRatiosAfterWinsor = new ArrayList<Float>();
for (Feature feature : features)
{
if (IsotopicLabelExtraInfoDef.hasRatio(feature))
{
float ratio = (float) IsotopicLabelExtraInfoDef.getRatio(feature);
float lightIntensity = (float) IsotopicLabelExtraInfoDef.getLightIntensity(feature);
float heavyIntensity = (float) IsotopicLabelExtraInfoDef.getHeavyIntensity(feature);
// if (lightIntensity == 0 && heavyIntensity == 0)
// throw new RuntimeException ("light & heavy 0");
if (ratio < winsorMinRatio)
{
ratio = winsorMinRatio;
lightIntensity = ratio * heavyIntensity;
}
if (ratio > winsorMaxRatio)
{
ratio = winsorMaxRatio;
heavyIntensity = lightIntensity / ratio;
}
logRatiosAfterWinsor.add((float) Math.log(ratio));
IsotopicLabelExtraInfoDef.setRatio(feature, ratio);
IsotopicLabelExtraInfoDef.setLightIntensity(feature, lightIntensity);
IsotopicLabelExtraInfoDef.setHeavyIntensity(feature, heavyIntensity);
}
}
new PanelWithHistogram(logRatiosAfterWinsor, "LogRatio postWinsor", 100).displayInTab();
}
ApplicationContext.infoMessage("Rows with one peptide: " + numFeaturesWithPeptides);
ApplicationContext.infoMessage("Unique peptides: " + allPeptides.size());
ApplicationContext.infoMessage("Ratios created for " + numFeaturesWithRatios + " features. Without: " + (numFeaturesWithPeptides - numFeaturesWithRatios));
ApplicationContext.infoMessage("Features with intensities too low for ratio: " + numFeaturesRatiosInsufficientIntensity);
FeatureSet featureSet = new FeatureSet(features.toArray(new Feature[features.size()]));
if (fastaFile != null)
MS2ExtraInfoDef.setFeatureSetSearchDatabasePath(featureSet, fastaFile.getAbsolutePath());
try
{
featureSet.savePepXml(outFile);
ApplicationContext.infoMessage("Saved feature file " + outFile.getAbsolutePath());
}
catch (Exception e)
{
throw new CommandLineModuleExecutionException("Failed to save featureset",e);
}
}
}