/*
* Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fhcrc.cpl.viewer.amt;
import java.util.*;
import java.io.File;
import org.apache.log4j.Logger;
import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet;
import org.fhcrc.cpl.toolbox.proteomics.feature.matching.FeatureSetMatcher;
import org.fhcrc.cpl.toolbox.proteomics.feature.matching.BaseFeatureSetMatcherImpl;
import org.fhcrc.cpl.toolbox.proteomics.feature.matching.Window2DFeatureSetMatcher;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.AmtExtraInfoDef;
import org.fhcrc.cpl.toolbox.proteomics.feature.filehandler.PepXMLFeatureFileHandler;
import org.fhcrc.cpl.toolbox.proteomics.MSRun;
import org.fhcrc.cpl.toolbox.proteomics.ProteomicsRegressionUtilities;
import org.fhcrc.cpl.toolbox.commandline.CommandLineModuleUtilities;
import org.fhcrc.cpl.toolbox.ApplicationContext;
import org.fhcrc.cpl.toolbox.statistics.BasicStatistics;
import org.fhcrc.cpl.toolbox.datastructure.Pair;
import org.fhcrc.cpl.toolbox.statistics.RegressionUtilities;
import org.fhcrc.cpl.toolbox.proteomics.MS2Modification;
import org.fhcrc.cpl.toolbox.proteomics.filehandler.PepXmlLoader;
/**
* Builds AMT databases, in various ways
*
* If you want to build an AMT database in some other way, put a method in here to do it
* your way.
*/
public class AmtDatabaseBuilder
{
static Logger _log = Logger.getLogger(AmtDatabaseBuilder.class);
//default cutoff values for tossing out observations with high studentized residuals
//Marty suggested 2.3 for this
public static final double DEFAULT_MAX_STUDENTIZED_RESIDUAL_FOR_REGRESSION = 2.0;
//Marty suggested 3.0 for this
public static final double DEFAULT_MAX_STUDENTIZED_RESIDUAL_FOR_INCLUSION = 2.0;
//default parameters for median alignment
public static final int DEFAULT_MIN_PEPTIDES_FOR_ALIGNMENT_REGRESSION=8;
public static final int DEFAULT_MIN_OBSERVATIONS_FOR_ALIGNMENT_REGRESSION=3;
//Default maximum leverage numerator (X/n) to keep for regression
public static final double DEFAULT_MAX_LEVERAGE_NUMERATOR = 4;
public static final float DEFAULT_MS1_MS2_MASS_TOLERANCE_PPM = 10;
public static final float DEFAULT_MS1_MS2_TIME_TOLERANCE_SECONDS = 25;
protected float ms1Ms2MassTolerancePPM = DEFAULT_MS1_MS2_MASS_TOLERANCE_PPM;
protected float ms1Ms2TimeToleranceSeconds = DEFAULT_MS1_MS2_TIME_TOLERANCE_SECONDS;
//If this is true, ignore any unknown modifications -- put the peptide in as though the
//modification didn't exist
public static final boolean DEFAULT_IGNORE_UNKNOWN_MODIFICATIONS = false;
protected boolean ignoreUnknownModifications = DEFAULT_IGNORE_UNKNOWN_MODIFICATIONS;
/**
* Create the default N-Terminal modifications that are used by X!Tandem.
* In later X!Tandem versions, these will be declared explicitly.
* If they're not, need to add them.
* @return
*/
public static MS2Modification[] createDefaultNTerminalModifications()
{
MS2Modification eMod = new MS2Modification();
eMod.setAminoAcid("E");
eMod.setMassDiff(-18.0106f);
eMod.setVariable(true);
MS2Modification qMod = new MS2Modification();
qMod.setAminoAcid("Q");
qMod.setMassDiff(-17.0265f);
qMod.setVariable(true);
return new MS2Modification[]
{
eMod,
qMod
};
}
/**
* calculate observed hydrophobicity for all features and
* add them all as observations to a new database.
*
* This one's the workhorse. Everything else calls it eventually
*
* @param scanOrTimeMode
* @return
*/
public static AmtDatabase createAmtDatabaseForRun(FeatureSet featureSet,
int scanOrTimeMode,
double[] timeToHCoefficients,
boolean calculateHydrophobicities,
Map<String, Integer> spectralCountsMap,
boolean ignoreUnknownModifications)
{
//build a map of known offsets from the features
Feature[] features = featureSet.getFeatures();
if (calculateHydrophobicities)
AmtUtilities.addHydrophobicityToFeatures(
featureSet.getFeatures(), scanOrTimeMode,
timeToHCoefficients);
//Now that we've got hydrophobicity on the features, create a db
AmtDatabase result = new AmtDatabase();
AmtRunEntry runEntry = new AmtRunEntry(timeToHCoefficients,
null);
MS2Modification[] runModifications = MS2ExtraInfoDef.getFeatureSetModifications(featureSet);
//check for explicitly-declared E and Q modifications. If they're not declared, add them
boolean hasEorQMod = false;
if (runModifications != null)
{
for (MS2Modification mod : runModifications)
{
String aa = mod.getAminoAcid();
if ("E".equals(aa) || "Q".equals(aa))
{
hasEorQMod=true;
break;
}
}
}
if (!hasEorQMod)
{
_log.debug("Adding default E and Q modifications, not specified explicitly in file");
int existingModsLength=0;
if (runModifications != null)
existingModsLength = runModifications.length;
MS2Modification[] defaultEQMods = createDefaultNTerminalModifications();
MS2Modification[] newModifications =
new MS2Modification[existingModsLength + defaultEQMods.length];
if (runModifications != null)
System.arraycopy(runModifications, 0, newModifications, 0, existingModsLength);
System.arraycopy(defaultEQMods, 0, newModifications, existingModsLength, defaultEQMods.length);
runModifications = newModifications;
}
runEntry.setModifications(runModifications);
if (_log.isDebugEnabled())
{
if (MS2ExtraInfoDef.getFeatureSetModifications(featureSet) != null)
for (MS2Modification mod : runModifications)
_log.debug("Adding modification from run: " + mod);
}
File sourceFile = featureSet.getSourceFile();
if (sourceFile != null)
runEntry.setPepXmlFilename(featureSet.getSourceFile().getName());
result.addRunEntry(runEntry);
for (Feature feature : features)
{
String peptideSequence = MS2ExtraInfoDef.getFirstPeptide(feature);
if (peptideSequence.contains("X"))
{
_log.debug("Skipping peptide sequence with aminoacid 'X'. Peptide: " +
peptideSequence);
}
else
{
//first try to resolve the modifications. If that works, fine. If not, then either
//bomb out, or throw a warning, depending on ignoreUnknownModifications
List<MS2Modification>[] ms2Modifications =
(List<MS2Modification>[]) new List[peptideSequence.length()];
try
{
ms2Modifications = result.resolveMods(peptideSequence,
MS2ExtraInfoDef.getModifiedAminoAcids(feature),
runEntry);
}
catch (IllegalArgumentException e)
{
if (ignoreUnknownModifications)
{
ApplicationContext.infoMessage("WARNING: could not resolve all peptide modifications: " +
e.getMessage());
}
else
{
throw e;
}
}
result.addObservation(peptideSequence, ms2Modifications, MS2ExtraInfoDef.getPeptideProphet(feature),
AmtExtraInfoDef.getObservedHydrophobicity(feature), runEntry,
spectralCountsMap, feature.getTime());
}
}
return result;
}
protected static Map<String, Double> performInitialRegression(Feature[] allFeatures,
int scanOrTimeMode)
{
Feature[] featuresForFirstRegression =
ProteomicsRegressionUtilities.selectFeaturesWithLowLeverage(allFeatures,
DEFAULT_MAX_LEVERAGE_NUMERATOR, scanOrTimeMode);
//Do an initial regression
return AmtUtilities.calculateScanOrTimeHydrophobicityRelationship(
featuresForFirstRegression,
scanOrTimeMode,
false);
}
/**
* Calculate "studentized residuals" for each feature.
*
* This is based on the residual of the feature, and on its leverage. It's a measure
* of the certainty that the feature is messed up w.r.t. the rest of the features.
*
* @param allFeatures
* the residual squared must be in order to be dropped from the set
* @return
*/
public static double[] calculateStudentizedResiduals(Feature[] allFeatures,
int scanOrTimeMode)
{
//number of features
int n = allFeatures.length;
if (n==0)
throw new IllegalArgumentException("ERROR! No features!");
if (scanOrTimeMode == ProteomicsRegressionUtilities.REGRESSION_MODE_TIME)
{
boolean nonzeroTimesExist = false;
for (Feature feature : allFeatures)
{
if (feature.getTime() > 0)
{
nonzeroTimesExist = true;
break;
}
}
if (!nonzeroTimesExist)
{
throw new IllegalArgumentException("ERROR! Feature retention times are all zero! " +
"Please populate feature " +
"times (e.g., using the 'populatems2times' command), or provide associated mzXML file(s) to " +
"use for populating scan times.");
}
}
//Do an initial regression using all features
Map<String, Double> scanOrTimeToHydroLine =
performInitialRegression(allFeatures, scanOrTimeMode);
//residuals based on the initial regression (y - yhat)
double[] hydrophobicityResiduals = new double[n];
//scan/time values for all features (x)
double[] scanOrTimeValues = new double[n];
//gather the information we need about each feature
for (int i=0; i<n; i++)
{
//record all x values
scanOrTimeValues[i] = ((scanOrTimeMode == ProteomicsRegressionUtilities.REGRESSION_MODE_TIME) ?
allFeatures[i].getTime() :
allFeatures[i].getScan());
//this value was already implicitly calculated above, when regression was
//performed, but we don't have easy access to it. Anyway, fast.
//This is yhat
double predictedH =
RegressionUtilities.predictYFromX(
AmtUtilities.getSlopeFromRegressionLine(scanOrTimeToHydroLine),
AmtUtilities.getInterceptFromRegressionLine(scanOrTimeToHydroLine),
scanOrTimeValues[i]);
//calculated H: y
double calculatedH =
AmtUtilities.calculateNormalizedHydrophobicity(
MS2ExtraInfoDef.getFirstPeptide(allFeatures[i]));
//residual is y - yhat
hydrophobicityResiduals[i] = calculatedH - predictedH;
}
//sigmaHydroError requires a denomenator of n-2, so can't use
//BasicStatistics.standardDeviation
double sumSquaredResiduals=0;
for (double residual : hydrophobicityResiduals)
sumSquaredResiduals += Math.pow(residual, 2);
double sigmaHydroError = Math.sqrt(sumSquaredResiduals / (n - 2));
//calculate the leverages of all values
double[] leverages = BasicStatistics.leverages(scanOrTimeValues);
//todo: switch this over to BasicStatistics.studentizedResiduals().
//Not doing it now because right before 2.0 release
double[] studentizedResiduals = new double[n];
//1 + 1/n. Same for all inputs, compute once
double onePlusOneOverN = 1 + 1/n;
//Find the "studentized residual" of every feature:
for (int i=0; i < n; i++)
{
double scaledResidual = hydrophobicityResiduals[i] / sigmaHydroError;
studentizedResiduals[i] = scaledResidual /
Math.sqrt(onePlusOneOverN + leverages[i]);
}
_log.debug("studentized residual mean: " + BasicStatistics.mean(studentizedResiduals) + ", stddev: " + BasicStatistics.standardDeviation(studentizedResiduals));
return studentizedResiduals;
}
/**
* Given an array of features, choose just the ones with residual <=
* of maxStudentizedResidual. This is done twice, so it's a method.
* @param allFeatures
* @param studentizedResiduals
* @param maxStudentizedResidual
* @return
*/
public static Feature[]
chooseFeaturesWithMaxStudentizedResidual(Feature[] allFeatures,
double[] studentizedResiduals,
double maxStudentizedResidual)
{
//square the max studentized residual, compare with square of studentized
//residual (since studentized residual is signed)
double maxStudentizedResidualSquared = Math.pow(maxStudentizedResidual,2);
List<Feature> passingFeatures = new ArrayList<Feature>();
for (int i=0; i<allFeatures.length; i++)
{
if (Math.pow(studentizedResiduals[i],2) <= maxStudentizedResidualSquared)
passingFeatures.add(allFeatures[i]);
}
return passingFeatures.toArray(new Feature[0]);
}
/**
* Count the spectra with each peptide identification. Assumes filtering
* has already been done such that these features are the ones we consider good
* @param features
* @return
*/
protected static Map<String,Integer> countSpectraForPeptides(Feature[] features)
{
Map<String, Integer> result = new HashMap<String, Integer>();
for (Feature feature : features)
{
String peptide = MS2ExtraInfoDef.getFirstPeptide(feature);
if (peptide == null)
continue;
if (result.containsKey(peptide))
result.put(peptide, result.get(peptide) + 1);
else
result.put(peptide, 1);
}
return result;
}
/**
* Create an AMT database for a single run, doing those arcane things we do
* to cast out the unworthy features. Here's the sequence:
* 1. Calculate the leverage of all features
* 2. Toss out all features with leverage > 4/n, perform a regression to
* map time to normalized H (first cut)
* 3. Calculate the studentized residual of all features from the original set
* 4. Take out features with studentized residual < 2.0, perform a regression
* to map time to normalized H (final version)
* 5. Take features with studentized residual < 2.0, insert those peptides
* into the database
*
*
* @param ms2FeatureSet
* @param ms1FeatureSet
* @param scanOrTimeMode
* @param robustRegression
* @param outNumFeaturesChosen
* @param maxStudentizedResidualForRegression
* @param maxStudentizedResidualForInclusion
* @return
*/
public AmtDatabase createAmtDatabaseForRun(FeatureSet ms2FeatureSet,
FeatureSet ms1FeatureSet,
int scanOrTimeMode,
boolean robustRegression,
Pair<Integer, Integer> outNumFeaturesChosen,
double maxStudentizedResidualForRegression,
double maxStudentizedResidualForInclusion)
{
Map<String, Integer> peptideSpectralCountsMap =
countSpectraForPeptides(ms2FeatureSet.getFeatures());
FeatureSet featureSetForProcessing = null;
if (ms1FeatureSet != null)
{
keepOnlySinglyMatchedMS1Features(ms2FeatureSet, ms1FeatureSet);
featureSetForProcessing = ms1FeatureSet;
}
else
{
AmtDatabaseMatcher.representPeptidesWithMedianTimePerPeptidePerMod(ms2FeatureSet);
featureSetForProcessing = ms2FeatureSet;
}
if (featureSetForProcessing.getFeatures().length == 0)
{
ApplicationContext.infoMessage("\tNo features to add to database! Skipping.");
return null;
}
ApplicationContext.infoMessage("\tFeatures to add to database: " + featureSetForProcessing.getFeatures().length);
Feature[] allProcessingFeatures = featureSetForProcessing.getFeatures();
double[] studentizedResiduals =
calculateStudentizedResiduals(allProcessingFeatures,
scanOrTimeMode);
Feature[] featuresForRegression =
chooseFeaturesWithMaxStudentizedResidual(allProcessingFeatures,
studentizedResiduals,
maxStudentizedResidualForRegression);
Map<String, Double> timeHydroLine =
AmtUtilities.calculateScanOrTimeHydrophobicityRelationship(
featuresForRegression,
scanOrTimeMode,
robustRegression);
double[] timeToHCoefficients = new double[2];
timeToHCoefficients[1] = AmtUtilities.getSlopeFromRegressionLine(timeHydroLine);
timeToHCoefficients[0] = AmtUtilities.getInterceptFromRegressionLine(timeHydroLine);
if (AmtUtilities.getSlopeFromRegressionLine(timeHydroLine) < 0)
ApplicationContext.infoMessage("WARNING: negative regression line slope!");
//the features for inclusion in the database may be chosen based on a different
//cutoff from those chosen for regression.
//TODO: Should we be recalculating the studentized residuals based on the prediction line
//we calculated?
Feature[] featuresForInclusion =
chooseFeaturesWithMaxStudentizedResidual(allProcessingFeatures,
studentizedResiduals,
maxStudentizedResidualForInclusion);
outNumFeaturesChosen.first = featuresForRegression.length;
outNumFeaturesChosen.second = featuresForInclusion.length;
_log.debug("Chose " + featuresForRegression.length + "/" + allProcessingFeatures.length +
" features for regression");
_log.debug("Chose " + featuresForInclusion.length + "/" + allProcessingFeatures.length +
" features for inclusion");
FeatureSet featureSetForInclusion = new FeatureSet(featuresForInclusion);
featureSetForInclusion.setSourceFile(featureSetForProcessing.getSourceFile());
MS2Modification[] modifications = MS2ExtraInfoDef.getFeatureSetModifications(featureSetForProcessing);
_log.debug("Setting modifications: " + modifications);
MS2ExtraInfoDef.setFeatureSetModifications(featureSetForInclusion,
modifications);
AmtDatabase amtDatabase =
createAmtDatabaseForRun(featureSetForInclusion, scanOrTimeMode,
timeToHCoefficients,
true, peptideSpectralCountsMap, ignoreUnknownModifications);
if (_log.isDebugEnabled())
{
int numTotalObservations = 0;
for (AmtPeptideEntry peptideEntry : amtDatabase.getEntries())
numTotalObservations += peptideEntry.getNumObservations();
_log.debug("Number of total observations: " + numTotalObservations);
}
return amtDatabase;
}
/**
* Given a directory full of pepXml files and a directory full of the
* corresponding mzXML files, create an AMT database from all the pepXml files,
* using the mzXML files to grab retention times.
*
* Highly dependent on naming conventions. If you're starting with an all.pep.xml
* file, use the method below instead.
*
* The nice thing about this is that it can check for existence of all the
* mzXml files before starting work.
* @param pepXmlDir
* @param mzXmlDir
* @param scanOrTimeMode
* @return
* @throws Exception
*/
public AmtDatabase createAmtDBFromDirectories(File pepXmlDir, File ms1Dir, File mzXmlDir,
int scanOrTimeMode,
FeatureSet.FeatureSelector featureSelector,
boolean robustRegression,
double maxStudentizedResidualForRegression,
double maxStudentizedResidualForInclusion,
boolean align)
throws Exception
{
boolean getTimesFromMzxml = (mzXmlDir != null);
AmtDatabase resultDB = new AmtDatabase();
List<File> ms2FeatureFiles = new ArrayList<File>();
Map<File,File> pepXmlMzXmlFileMap = new HashMap<File,File>();
String[] potentialFiles = pepXmlDir.list();
Arrays.sort(potentialFiles);
for (String pepXmlFileName : potentialFiles)
{
File ms2FeatureFile =
new File(pepXmlDir.getAbsolutePath() + File.separatorChar +
pepXmlFileName);
if (ms2FeatureFile.exists() && !ms2FeatureFile.isDirectory())
ms2FeatureFiles.add(ms2FeatureFile);
}
if (getTimesFromMzxml)
{
for (File pepXmlFile : ms2FeatureFiles)
{
File mzXmlFile = CommandLineModuleUtilities.findFileLikeFile(pepXmlFile, mzXmlDir,"mzXML");
pepXmlMzXmlFileMap.put(pepXmlFile, mzXmlFile);
}
}
Map<File,File> pepXmlMS1FileMap = new HashMap<File,File>();
if (ms1Dir != null)
{
for (File pepXmlFile : ms2FeatureFiles)
{
File ms1File = CommandLineModuleUtilities.findFileLikeFile(pepXmlFile, ms1Dir, "");
pepXmlMS1FileMap.put(pepXmlFile, ms1File);
}
}
//for each pair of pepXml and mzXml files, create an amt database
//and add its observations, runs and mods to the cumulative db
int numFeaturesChosenForRegression = 0;
int numFeaturesChosenForInclusion = 0;
Pair<Integer,Integer> numFeaturesDummy = new Pair<Integer,Integer>(0,0);
for (File ms2FeatureFile : ms2FeatureFiles)
{
ApplicationContext.infoMessage("Adding features from file " + ms2FeatureFile.getAbsolutePath());
FeatureSet ms2FeatureSet = new FeatureSet(ms2FeatureFile);
ms2FeatureSet = ms2FeatureSet.filter(featureSelector);
if (getTimesFromMzxml)
{
File mzXmlFile = pepXmlMzXmlFileMap.get(ms2FeatureFile);
_log.debug("Populating times for MS2 file " + ms2FeatureFile + " from mzXML file " + mzXmlFile);
MSRun thisRun = MSRun.load(mzXmlFile.getAbsolutePath());
ms2FeatureSet.populateTimesForMS2Features(thisRun);
}
FeatureSet ms1FeatureSet = null;
if (pepXmlMS1FileMap.get(ms2FeatureFile) != null)
{
File ms1File = pepXmlMS1FileMap.get(ms2FeatureFile);
ms1FeatureSet = new FeatureSet(ms1File);
ApplicationContext.infoMessage("\tUsing " + ms1FeatureSet.getFeatures().length +
" MS1 features from file " + ms1File.getAbsolutePath());
}
if (align)
{
addRunToAmtDatabase(resultDB, ms2FeatureSet, ms1FeatureSet,
scanOrTimeMode, robustRegression, maxStudentizedResidualForRegression,
maxStudentizedResidualForInclusion, 2, .05, 10);
}
else
{
AmtDatabase thisRunDB =
createAmtDatabaseForRun(ms2FeatureSet, ms1FeatureSet,
scanOrTimeMode, robustRegression,
numFeaturesDummy, maxStudentizedResidualForRegression,
maxStudentizedResidualForInclusion);
if (thisRunDB != null)
{
numFeaturesChosenForRegression += numFeaturesDummy.first;
numFeaturesChosenForInclusion += numFeaturesDummy.second;
resultDB.addObservationsFromAnotherDatabase(thisRunDB);
_log.debug("Features chosen for regression: " + numFeaturesChosenForRegression);
ApplicationContext.infoMessage("\tPeptides from this run: " + thisRunDB.numEntries() +
", running count: " + resultDB.numEntries());
}
}
}
return resultDB;
}
/**
*
* @param ms2FeatureSet
* @param ms1FeatureSet
*/
protected void keepOnlySinglyMatchedMS1Features(FeatureSet ms2FeatureSet, FeatureSet ms1FeatureSet)
{
int numTotalMS1Features=ms1FeatureSet.getFeatures().length;
Window2DFeatureSetMatcher featureSetMatcher =
new Window2DFeatureSetMatcher();
featureSetMatcher.setMassDiffType(FeatureSetMatcher.DELTA_MASS_TYPE_PPM);
featureSetMatcher.setMaxMassDiff(ms1Ms2MassTolerancePPM);
featureSetMatcher.setMinMassDiff(-ms1Ms2MassTolerancePPM);
featureSetMatcher.setMaxElutionDiff(ms1Ms2TimeToleranceSeconds);
featureSetMatcher.setMinElutionDiff(-ms1Ms2TimeToleranceSeconds);
featureSetMatcher.setElutionMode(BaseFeatureSetMatcherImpl.ELUTION_MODE_TIME);
featureSetMatcher.setMatchWithinChargeOnly(true);
FeatureSetMatcher.FeatureMatchingResult ms1MS2MatchingResult =
featureSetMatcher.matchFeatures(ms1FeatureSet, ms2FeatureSet);
if (_log.isDebugEnabled())
{
List<Float> deltaMasses = new ArrayList<Float>();
List<Float> deltaTimes = new ArrayList<Float>();
for (Feature feature : ms1MS2MatchingResult.getMasterSetFeatures())
{
List<Feature> ms2MatchedFeatures = ms1MS2MatchingResult.getSlaveSetFeatures(feature);
for (Feature ms2Feature : ms2MatchedFeatures)
{
deltaMasses.add((ms2Feature.getMass() - feature.getMass()) * 1000000 / ms2Feature.getMass());
deltaTimes.add(ms2Feature.getTime() - feature.getTime());
}
}
float deltaMassStdDev = BasicStatistics.standardDeviationFloatList(deltaMasses);
float deltaTimeStdDev = BasicStatistics.standardDeviationFloatList(deltaTimes);
//PanelWithHistogram pwh2 = new PanelWithHistogram(deltaTimes, "deltaTime");
//pwh2.displayDialog("deltaTime");
_log.debug("Matching MS1 to MS2: 3 standard deviations: mass=" + (3 * deltaMassStdDev) +
", time=" + (3*deltaTimeStdDev));
}
List<Feature> singlyMatchedMS1Features = new ArrayList<Feature>();
for (Feature feature : ms1MS2MatchingResult.getMasterSetFeatures())
{
List<Feature> ms2MatchedFeatures = ms1MS2MatchingResult.getSlaveSetFeatures(feature);
Set<String> peptides = new HashSet<String>();
for (Feature ms2MatchedFeature : ms2MatchedFeatures)
peptides.add(MS2ExtraInfoDef.getFirstPeptide(ms2MatchedFeature));
if (peptides.size() == 1)
{
Feature ms2MatchedFeature = ms2MatchedFeatures.get(0);
MS2ExtraInfoDef.setSinglePeptide(feature, peptides.iterator().next());
MS2ExtraInfoDef.setModifiedAminoAcids(feature, MS2ExtraInfoDef.getModifiedAminoAcids(ms2MatchedFeature));
MS2ExtraInfoDef.setPeptideProphet(feature, MS2ExtraInfoDef.getPeptideProphet(ms2MatchedFeature));
singlyMatchedMS1Features.add(feature);
}
}
ms1FeatureSet.setFeatures(singlyMatchedMS1Features.toArray(new Feature[singlyMatchedMS1Features.size()]));
//set featureset modifications
MS2ExtraInfoDef.setFeatureSetModifications(ms1FeatureSet,
MS2ExtraInfoDef.getFeatureSetModifications(ms2FeatureSet));
_log.debug("MS2 features: " + ms2FeatureSet.getFeatures().length + ", MS1 features: " +
numTotalMS1Features + ", total matched: " + ms1MS2MatchingResult.size() +
", singly matched: " + singlyMatchedMS1Features.size() );
}
/**
* workinprogress
*
* Add a run to an AMT database.
*
* After linear H calculation, if we have significant agreeing peptide overlap (at least
* minMatchedPeptides that match database entries with at least minDBObservationsForAlignmentMatch
* observations, within maxHDiffForAlignmentMatch H units) then use those agreeing
* peptides as points for alignment, and set all the rest of the H values according
* to that alignment.
*
* @param amtDatabase
* @param newRunFeatureSet
* @param scanOrTimeMode
* @param robustRegression
* @param maxStudentizedResidualForRegression
* @param maxStudentizedResidualForInclusion
* @param maxHDiffForAlignmentMatch
* @param minMatchedPeptidesForAlignment
*/
public void addRunToAmtDatabase(AmtDatabase amtDatabase, FeatureSet newRunFeatureSet,
FeatureSet ms1FeatureSet,
int scanOrTimeMode,
boolean robustRegression,
double maxStudentizedResidualForRegression,
double maxStudentizedResidualForInclusion,
double minDBObservationsForAlignmentMatch,
double maxHDiffForAlignmentMatch,
int minMatchedPeptidesForAlignment)
{
Pair<Integer, Integer> outNumFeaturesChosen = new Pair<Integer, Integer>(0,0);
AmtDatabase thisRunDB = createAmtDatabaseForRun(
newRunFeatureSet,
ms1FeatureSet,
scanOrTimeMode,
robustRegression,
outNumFeaturesChosen,
maxStudentizedResidualForRegression,
maxStudentizedResidualForInclusion);
amtDatabase.addObservationsFromAnotherDatabase(thisRunDB);
}
/**
* Create an AMT database from an all.pep.xml file -- pass null
* to the workhorse method so that it knows we need to look in the file
* from a reference to the mzXML file
* @param allPepXmlFile
* @param scanOrTimeMode
* @param featureSelector
* @param robustRegression
* @return
* @throws Exception
*/
public AmtDatabase createAmtDBFromAllPepXml(File allPepXmlFile,
File ms1Dir,
int scanOrTimeMode,
FeatureSet.FeatureSelector featureSelector,
boolean robustRegression,
double maxStudentizedResidualForRegression,
double maxStudentizedResidualForInclusion)
throws Exception
{
return createAmtDBFromPepXml(allPepXmlFile, ms1Dir, null, scanOrTimeMode,
featureSelector, robustRegression,
maxStudentizedResidualForRegression,
maxStudentizedResidualForInclusion);
}
/**
* Create an AMT database from a pep.xml file.
*
* If no mzXml file is specified, look in the pepXml fractions themselves
* to find mzXML files
*
* Warning:
* Since PepXmlLoader runs through the pepXml file sequentially, we have to
* load the databases one at a time... so if an mzXml file is missing
* at the very end, we won't find it until we've done most of the work.
*
* TODO: currently this will fail if you're on an OS that uses a different
* file separator character from the one the all.pep.xml file was created on,
* because fraction.getSpectrumPath() will have the wrong characters. Should be converted
* @param allPepXmlFile
* @param scanOrTimeMode
* @return
* @throws Exception
*/
public AmtDatabase createAmtDBFromPepXml(File allPepXmlFile,
File ms1Dir,
MSRun run,
int scanOrTimeMode,
FeatureSet.FeatureSelector featureSelector,
boolean robustRegression,
double maxStudentizedResidualForRegression,
double maxStudentizedResidualForInclusion)
throws Exception
{
AmtDatabase resultDB = new AmtDatabase();
PepXmlLoader pepXmlLoader = new PepXmlLoader(allPepXmlFile, _log);
PepXmlLoader.FractionIterator fractionIterator =
pepXmlLoader.getFractionIterator();
File pepXmlFileDir = allPepXmlFile.getParentFile();
boolean shouldLoadRuns =
(run == null && scanOrTimeMode == ProteomicsRegressionUtilities.REGRESSION_MODE_TIME);
int numFeaturesChosenForRegression=0;
int numFeaturesChosenForInclusion=0;
Pair<Integer,Integer> numFeaturesDummy = new Pair<Integer,Integer>(0,0);
int numFeaturesEvaluated = 0;
//for each fraction, create a featureset and locate the mzXml file,
//create an amt database, and add all the observations, runs and mods
PepXMLFeatureFileHandler pepXmlFileHandler = new PepXMLFeatureFileHandler();
while (fractionIterator.hasNext())
{
PepXmlLoader.PepXmlFraction fraction =
(PepXmlLoader.PepXmlFraction) fractionIterator.next();
_log.debug("fraction " + fraction.getDataBasename());
FeatureSet fractionFeatureSet =
pepXmlFileHandler.createFeatureSetFromPepXMLFraction(fraction, pepXmlLoader);
fractionFeatureSet = fractionFeatureSet.filter(featureSelector);
//remove all but the first occurrence of each peptide
MS2ExtraInfoDef.setFeatureSetModifications(fractionFeatureSet,
fraction.getModifications().toArray(new MS2Modification[0]));
if (shouldLoadRuns)
{
File mzXmlFile = new File(pepXmlFileDir.getAbsolutePath() +
File.separatorChar +
fraction.getSpectrumPath());
_log.debug("mzxml file: " + mzXmlFile.getAbsolutePath());
run = MSRun.load(mzXmlFile.getAbsolutePath());
}
if (scanOrTimeMode == ProteomicsRegressionUtilities.REGRESSION_MODE_TIME)
fractionFeatureSet.populateTimesForMS2Features(run);
numFeaturesEvaluated += fractionFeatureSet.getFeatures().length;
FeatureSet ms1FeatureSet = null;
if (ms1Dir != null)
{
File ms1File = CommandLineModuleUtilities.findFileWithPrefix(fraction.getDataBasename(), ms1Dir,"tsv");
ms1FeatureSet = new FeatureSet(ms1File);
}
AmtDatabase thisRunDB =
createAmtDatabaseForRun(fractionFeatureSet,ms1FeatureSet,
scanOrTimeMode, robustRegression, numFeaturesDummy,
maxStudentizedResidualForRegression,
maxStudentizedResidualForInclusion);
_log.debug("Created fraction database: " + thisRunDB);
numFeaturesChosenForRegression += numFeaturesDummy.first;
numFeaturesChosenForInclusion += numFeaturesDummy.second;
resultDB.addObservationsFromAnotherDatabase(thisRunDB);
thisRunDB.getRunBySequence(1).setMzXmlFilename(run.getFileName());
thisRunDB.getRunBySequence(1).setPepXmlFilename(fraction.getDataBasename() + ".pep.xml");
}
_log.debug("Features chosen for final regression: " + numFeaturesChosenForRegression);
_log.debug("Features chosen for inclusion: " + numFeaturesChosenForInclusion);
_log.debug("Total features evaluated: " + numFeaturesEvaluated);
_log.debug("Resulting database: " + resultDB);
return resultDB;
}
/**
* Create a feature set based on this AMT database, mapped to a particular run.
* Make a BUNCH of assumptions.
* @param run
* @param regressionMS2FeatureSet
* @return
*/
public static FeatureSet buildFeatureSet(AmtDatabase amtDB,
MSRun run, FeatureSet regressionMS2FeatureSet,
int chargeForAllFeatures, double minPeptideProphet,
boolean robustRegression)
{
ArrayList<Feature> resultFeaturesList = new ArrayList<Feature>();
FeatureSet.FeatureSelector sel = new FeatureSet.FeatureSelector();
sel.setMinPProphet((float)minPeptideProphet);
regressionMS2FeatureSet = regressionMS2FeatureSet.filter(sel);
regressionMS2FeatureSet.populateTimesForMS2Features(run);
Map<String,Double> regressionMap =
AmtUtilities.calculateHydrophobicityScanOrTimeRelationship(
regressionMS2FeatureSet.getFeatures(),
ProteomicsRegressionUtilities.REGRESSION_MODE_TIME,
robustRegression);
double slope = regressionMap.get(RegressionUtilities.REGRESSION_SLOPE_KEY);
double intercept = regressionMap.get(RegressionUtilities.REGRESSION_INTERCEPT_KEY);
for (AmtPeptideEntry peptideEntry : amtDB.getEntries())
{
int scan = (int) AmtUtilities.predictScanOrTime(slope, intercept,
peptideEntry.getMedianObservedHydrophobicity());
Feature newFeature =
MS2ExtraInfoDef.createMS2Feature(scan, (float) peptideEntry.getMass(), chargeForAllFeatures,
peptideEntry.getPeptideSequence(), null, null);
AmtExtraInfoDef.setObservedHydrophobicity(newFeature,peptideEntry.getMedianObservedHydrophobicity());
//fake support for our fake charge
if (chargeForAllFeatures>1)
newFeature.setPeaks(2);
//fake intensity
newFeature.setIntensity(200);
resultFeaturesList.add(newFeature);
}
return new FeatureSet(resultFeaturesList.toArray(new Feature[0]));
}
public float getMs1Ms2MassTolerancePPM()
{
return ms1Ms2MassTolerancePPM;
}
public void setMs1Ms2MassTolerancePPM(float ms1Ms2MassTolerancePPM)
{
this.ms1Ms2MassTolerancePPM = ms1Ms2MassTolerancePPM;
}
public float getMs1Ms2TimeToleranceSeconds()
{
return ms1Ms2TimeToleranceSeconds;
}
public void setMs1Ms2TimeToleranceSeconds(float ms1Ms2TimeToleranceSeconds)
{
this.ms1Ms2TimeToleranceSeconds = ms1Ms2TimeToleranceSeconds;
}
public boolean isIgnoreUnknownModifications()
{
return ignoreUnknownModifications;
}
public void setIgnoreUnknownModifications(boolean ignoreUnknownModifications)
{
this.ignoreUnknownModifications = ignoreUnknownModifications;
}
}