/*
* Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fhcrc.cpl.viewer.metabologna;
import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.proteomics.feature.Spectrum;
import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet;
import org.fhcrc.cpl.toolbox.proteomics.feature.matching.FeatureSetMatcher;
import org.fhcrc.cpl.toolbox.proteomics.MassUtilities;
import org.fhcrc.cpl.toolbox.chem.*;
import org.fhcrc.cpl.toolbox.gui.chart.PanelWithScatterPlot;
import org.fhcrc.cpl.toolbox.gui.chart.PanelWithHistogram;
import org.fhcrc.cpl.toolbox.gui.chart.PanelWithBlindImageChart;
import org.fhcrc.cpl.toolbox.statistics.RegressionUtilities;
import org.fhcrc.cpl.toolbox.statistics.BasicStatistics;
import org.fhcrc.cpl.toolbox.statistics.RInterface;
import org.fhcrc.cpl.toolbox.ApplicationContext;
import org.fhcrc.cpl.toolbox.Rounder;
import org.fhcrc.cpl.toolbox.commandline.CommandLineModuleExecutionException;
import org.fhcrc.cpl.toolbox.datastructure.Pair;
import org.fhcrc.cpl.toolbox.filehandler.TempFileManager;
import org.apache.log4j.Logger;
import java.util.*;
import java.io.IOException;
import java.io.File;
/**
* If multiple Adducts have the same chemical formula, and some of them have modifications, any modified Adducts
* with that chemical formula will be removed from matching results.
*
* Change feature masses to be mz * charge, rather than (mz - H) * charge
*/
public class MetaboliteDatabaseMatcher
{
//Class logger
protected static Logger _log = Logger.getLogger(MetaboliteDatabaseMatcher.class);
//Initial guess at the proportion of mass matches drawn from the true (normal) error distribution,
//used to seed the EM mixture fit in calcRunMassError
public static final float DEFAULT_INITIAL_MATCH_PROPORTION = 0.1f;
//Default loose mass tolerance (PPM) used when matching features for calibration
public static final float DEFAULT_CALIBRATION_LOOSE_TOLERANCE_PPM = 5;
//Default outlier cutoffs (studentized residual, leverage) for the calibration regression
public static final float DEFAULT_CALIBRATION_MAX_STUD_RES = 1;
public static final float DEFAULT_CALIBRATION_MAX_LEVERAGE = 6;
//Default degrees of freedom for the modal regression used in calibration
public static final int DEFAULT_CALIBRATION_REGRESSION_DEGREES_FREEDOM = 1;
//Working calibration parameters, initialized to the defaults above
protected float calMassTolerancePPM = DEFAULT_CALIBRATION_LOOSE_TOLERANCE_PPM;
protected float calMaxLeverage = DEFAULT_CALIBRATION_MAX_LEVERAGE;
protected double calMaxStudRes = DEFAULT_CALIBRATION_MAX_STUD_RES;
protected int calRegressionDoF = DEFAULT_CALIBRATION_REGRESSION_DEGREES_FREEDOM;
//Lazily built by getOrCreateFormulaAdductsMap(): each distinct formula -> all adducts with that formula
protected Map<ChemicalFormula, List<Adduct>> allFormulaAdductsMap;
//Compound database to match against; must be set before matching is attempted
protected List<ChemicalCompound> databaseCompounds = null;
//If true, diagnostic charts are displayed in tabs as work proceeds
protected boolean showCharts = false;
//If true, an unmodified adduct is generated for each compound in addition to any modified adducts
protected boolean useUnmodifiedAdduct = true;
//When we're calculating the ppm error that contains 'most' of the points in the 2-peak error distribution,
//how many standard deviations is 'most'?
protected float sigmaMultipleForErrorDist = 2;
//Chemical modifications to apply. Adducts will be created with /one/ of these and with /none/ of these
//todo: implement rules about applying different modifications to types of adducts
protected List<ChemicalModification> chemicalModifications = new ArrayList<ChemicalModification>();
/**
 * Calibrate feature masses against database mass matches, correcting the features (and their
 * component peaks) in place, and return the mass calibration coefficients.
 *
 * Feature masses are always recomputed as mz * charge (not (mz - H) * charge), since these
 * are not peptides.
 * @param features features to calibrate; masses, mzs and component peak mzs are modified in place
 * @return coefficients of the regression of mass error (Da) on mass, as consumed by
 *         RegressionUtilities.mapValueUsingCoefficients
 * @throws IOException if the regression calculation fails
 */
public double[] calibrateFeatureMasses(Feature[] features) throws IOException
{
    _log.debug("calibrating feature masses. " + features.length + " features, mass tolerance: " +
            calMassTolerancePPM + "ppm");
    //First by time, then by mass
    Map<Feature, Map<ChemicalFormula, List<Adduct>>> featureMassMatchMap =
            massMatchFull(features, calMassTolerancePPM, 1);
    _log.debug(featureMassMatchMap.size() + " features matched");

    //Record original (mz * charge) masses up front so the total calibration-induced change can be
    //reported below.  Bug fix: this list was previously only populated by RT-calibration code that
    //had been commented out, so the debug reporting below threw IndexOutOfBoundsException whenever
    //debug logging was enabled.  (The RT-based calibration that used to live here was judged
    //misguided and has been removed.)
    List<Float> origMasses = new ArrayList<Float>();
    for (Feature feature : features)
        origMasses.add(feature.getMz() * feature.getCharge());

    _log.debug("Calibrating by mass");
    featureMassMatchMap =
            massMatchFull(features, calMassTolerancePPM, 1);
    if (showCharts)
    {
        showMassDeltaMassChart(featureMassMatchMap, "DeltaMassVsMass_beforeMassCal");
    }
    //Robust (outlier-excluding) modal regression of mass error (Da) on feature mass
    double[] calibrationCoefficientsByMass = calcMassCalibrationUsingMatches(featureMassMatchMap, false, false);
    if (_log.isDebugEnabled())
    {
        System.err.print("Mass calibration coefficients: ");
        for (double coeff : calibrationCoefficientsByMass)
            System.err.print(" " + coeff);
        System.err.println();
    }
    List<Float> featureAbsMassChangesByMassPPM = new ArrayList<Float>();
    for (Feature feature : features)
    {
        //paranoia -- just in case feature.getMass() hasn't been corrected for non-peptide use
        float featureMass = feature.getMz() * feature.getCharge();
        float massDiffDa = (float) RegressionUtilities.mapValueUsingCoefficients(calibrationCoefficientsByMass,
                featureMass);
        float newMass = featureMass - massDiffDa;
        featureAbsMassChangesByMassPPM.add(MassUtilities.convertDaToPPM(Math.abs(massDiffDa), featureMass));
        feature.setMass(newMass);
        feature.setMz(newMass / feature.getCharge());
        //Correct each component peak too, using the error predicted at that peak's own mass
        if (feature.comprised != null)
        {
            for (Spectrum.Peak peak : feature.comprised)
            {
                if (peak == null)
                    continue;
                float peakMass = peak.mz * feature.getCharge();
                float massDiffDaThisPeak = (float) RegressionUtilities.mapValueUsingCoefficients(
                        calibrationCoefficientsByMass,
                        peakMass);
                peak.setMz(peak.mz - (massDiffDaThisPeak / Math.abs(feature.charge)));
            }
        }
    }
    _log.debug("Mean absolute mass change by mass: " +
            Rounder.round(BasicStatistics.mean(featureAbsMassChangesByMassPPM), 1) + "ppm");
    if (_log.isDebugEnabled())
    {
        List<Float> totalMassChangesPPM = new ArrayList<Float>();
        for (int i = 0; i < features.length; i++)
            totalMassChangesPPM.add(MassUtilities.convertDaToPPM(features[i].mass - origMasses.get(i),
                    features[i].mass));
        _log.debug("Mean absolute mass change with both calibrations: " +
                Rounder.round(BasicStatistics.mean(totalMassChangesPPM), 1) + "ppm");
        if (showCharts)
        {
            new PanelWithHistogram(totalMassChangesPPM, "Cal PPM Changes").displayInTab();
        }
    }
    //Re-match with calibrated masses so the "after" chart reflects the correction
    featureMassMatchMap =
            massMatchFull(features, getOrCreateFormulaAdductsMap(), calMassTolerancePPM, 1);
    if (showCharts)
    {
        showMassDeltaMassChart(featureMassMatchMap, "DeltaMassVsMass_afterMassCal");
    }
    return calibrationCoefficientsByMass;
}
/**
 * Display a scatterplot of mass-match error (PPM) against matched formula mass, in a tab
 * with the given name.
 * @param featureMassMatchMap map from features to their matched formulas (with adducts)
 * @param name chart tab name
 */
protected void showMassDeltaMassChart(Map<Feature, Map<ChemicalFormula, List<Adduct>>> featureMassMatchMap,
                                      String name)
{
    List<Double> matchedFormulaMasses = new ArrayList<Double>();
    List<Double> matchDeltaMassesPPM = new ArrayList<Double>();
    for (Map.Entry<Feature, Map<ChemicalFormula, List<Adduct>>> featureEntry : featureMassMatchMap.entrySet())
    {
        Feature feature = featureEntry.getKey();
        //observed mass is mz * charge (these aren't peptides, so no proton subtraction)
        float observedMass = feature.getMz() * feature.getCharge();
        for (ChemicalFormula matchedFormula : featureEntry.getValue().keySet())
        {
            double formulaMass = matchedFormula.getCommonestIsotopeMass();
            matchedFormulaMasses.add(formulaMass);
            matchDeltaMassesPPM.add((double) MassUtilities.convertDaToPPM(
                    observedMass - (float) formulaMass, (float) formulaMass));
        }
    }
    new PanelWithScatterPlot(matchedFormulaMasses, matchDeltaMassesPPM, name).displayInTab();
}
/**
 * Regress mass-match error against either feature retention time or feature mass, excluding
 * high-leverage / high-residual outliers, and return the modal regression coefficients.
 * @param featureMassMatchMap map from features to their matched formulas
 * @param byTime if true, regress error on retention time; otherwise on mass
 * @param usePPM if true, express mass error in PPM; otherwise in Daltons
 * @return modal regression coefficients mapping x (time or mass) to mass error
 * @throws IOException if the modal regression fails
 */
protected double[] calcMassCalibrationUsingMatches(Map<Feature, Map<ChemicalFormula, List<Adduct>>> featureMassMatchMap,
                                                   boolean byTime, boolean usePPM) throws IOException
{
    List<Double> massDiffsListPreCal = new ArrayList<Double>();
    List<Double> featureXvalListPreCal = new ArrayList<Double>();
    String chartSuffix = byTime ? "_RT" : "_MASS";
    //Bug fix: the Daltons suffix was missing its underscore ("DA"), yielding chart names
    //like "MassCal_MASSDA" instead of "MassCal_MASS_DA"
    chartSuffix = chartSuffix + (usePPM ? "_PPM" : "_DA");
    String xAxisLabel = byTime ? "time" : "mass";
    //One (x, massError) datapoint per feature-formula match
    for (Feature feature : featureMassMatchMap.keySet())
    {
        float featureMass = feature.getMz() * feature.getCharge();
        for (ChemicalFormula formula : featureMassMatchMap.get(feature).keySet())
        {
            double deltaMassDa = featureMass - formula.getCommonestIsotopeMass();
            massDiffsListPreCal.add(usePPM ?
                    MassUtilities.convertDaToPPM((float) deltaMassDa, (float) formula.getCommonestIsotopeMass()) : deltaMassDa);
            featureXvalListPreCal.add(byTime ? (double) feature.getTime() : featureMass);
        }
    }
    double[] featureXvals = new double[featureXvalListPreCal.size()];
    double[] massDiffs = new double[massDiffsListPreCal.size()];
    for (int i = 0; i < featureXvals.length; i++)
    {
        featureXvals[i] = featureXvalListPreCal.get(i);
        massDiffs[i] = massDiffsListPreCal.get(i);
    }
    //Exclude outliers (by leverage and studentized residual) before regressing
    int[] indexesLowStuff = RegressionUtilities.selectIndexesWithLowLeverageAndStudentizedResidual(
            featureXvals, massDiffs, calMaxLeverage,
            calMaxStudRes, false, 1, false, false);
    double[] featureXvalsForRegression = new double[indexesLowStuff.length];
    double[] massDiffsForRegression = new double[indexesLowStuff.length];
    for (int i = 0; i < indexesLowStuff.length; i++)
    {
        featureXvalsForRegression[i] = featureXvals[indexesLowStuff[i]];
        massDiffsForRegression[i] = massDiffs[indexesLowStuff[i]];
    }
    double[] resultCoefficients = RegressionUtilities.modalRegression(featureXvalsForRegression,
            massDiffsForRegression, calRegressionDoF);
    if (showCharts)
    {
        double minXval = BasicStatistics.min(featureXvals);
        double maxXval = BasicStatistics.max(featureXvals);
        PanelWithScatterPlot psp = new PanelWithScatterPlot();
        String chartName = "MassCal" + chartSuffix;
        psp.setName(chartName);
        psp.addLineOrCurve(resultCoefficients, minXval, maxXval);
        psp.addData(featureXvalsForRegression, massDiffsForRegression,
                "Matches used in regression");
        psp.addData(featureXvals, massDiffs, "all mass matches");
        String massDiffLabel = "mass difference" + (usePPM ? " (ppm)" : " (da)");
        psp.setAxisLabels(xAxisLabel, massDiffLabel);
        psp.displayInTab();
    }
    return resultCoefficients;
}
/**
 * Mass-match features against the given formula->adducts map, keeping only the single
 * closest-by-mass formula match for each feature.
 * @param features features, sorted by mass ascending
 * @param formulaAdductsMap map from formulas to adducts to match against
 * @param massTolPPM mass tolerance, in PPM
 * @return map from each matched feature to a single-entry map holding its best formula match
 */
public Map<Feature, Map<ChemicalFormula, List<Adduct>>> massMatchBestOnly(Feature[] features,
                                              Map<ChemicalFormula, List<Adduct>> formulaAdductsMap,
                                              float massTolPPM)
{
    Map<Feature, Map<ChemicalFormula, List<Adduct>>> result =
            massMatchFull(features, formulaAdductsMap, massTolPPM, 1);
    for (Feature feature : result.keySet())
    {
        double smallestAbsDeltaMassPPM = Double.MAX_VALUE;
        ChemicalFormula bestMatch = null;
        for (ChemicalFormula formula : result.get(feature).keySet())
        {
            double deltaMassPPM = MassUtilities.convertDaToPPM(
                    (float) (feature.mass - formula.getCommonestIsotopeMass()), feature.mass);
            //Bug fix: previously the SIGNED delta was stored here; after one negative delta was
            //stored, no later candidate could ever satisfy abs(delta) < storedValue, so the
            //closest match was not reliably selected.  Track the absolute value instead.
            if (Math.abs(deltaMassPPM) < smallestAbsDeltaMassPPM)
            {
                smallestAbsDeltaMassPPM = Math.abs(deltaMassPPM);
                bestMatch = formula;
            }
        }
        //Replacing the value of an existing key is not a structural modification, so this is
        //safe during keySet iteration
        Map<ChemicalFormula, List<Adduct>> bestMap = new HashMap<ChemicalFormula, List<Adduct>>();
        bestMap.put(bestMatch, result.get(feature).get(bestMatch));
        result.put(feature, bestMap);
    }
    return result;
}
/**
 * Lazily build (on first call) and return the formula->adducts map for the full compound
 * database.  Not thread-safe.
 * @return map from each distinct chemical formula to all adducts with that formula
 */
protected Map<ChemicalFormula, List<Adduct>> getOrCreateFormulaAdductsMap()
{
    if (allFormulaAdductsMap == null)
    {
        //Consistency fix: log through _log like the rest of this class, rather than
        //writing directly to System.err
        _log.debug("Generating formula->adducts map");
        allFormulaAdductsMap = createFormulaAdductsMap(databaseCompounds);
    }
    return allFormulaAdductsMap;
}
/**
 * Apply all chemical modifications to all compounds to which they can be performed, and assemble
 * the results into a map from formulas to lists of adducts with that formula.
 *
 * After generating full map, clean it up. For each chemical formula, if there are multiple
 * adducts with the same SMILES string, remove any that have modifications.  (That cleanup is
 * currently disabled as too expensive.)
 * @param compounds database compounds to build adducts for
 * @return map from each distinct chemical formula to all adducts with that formula
 */
protected Map<ChemicalFormula, List<Adduct>> createFormulaAdductsMap(List<ChemicalCompound> compounds)
{
    //Canonical ChemicalFormula instance per formula string, so equivalent formulas share one map key
    Map<String, ChemicalFormula> formulaStringFormulaMap = new HashMap<String, ChemicalFormula>();
    Map<ChemicalFormula, List<Adduct>> allFormulaAdductsMap = new HashMap<ChemicalFormula, List<Adduct>>();
    //Accounting: which compounds each modification was actually applied to
    Map<ChemicalModification, List<ChemicalCompound>> modCompoundsMap =
            new HashMap<ChemicalModification, List<ChemicalCompound>>();
    _log.debug("Creating formula-adducts map for " + compounds.size() + " compounds.");
    if (chemicalModifications.isEmpty())
        _log.debug("\tNo modifications to compounds");
    else
    {
        _log.debug("\tModifications to compounds: ");
        for (ChemicalModification mod : chemicalModifications)
        {
            _log.debug("\t\t" + mod.getName());
        }
    }
    //Bug fix: log progress every ~5%, but guard against division by zero when fewer than
    //20 compounds are passed (this method IS called with small filtered compound lists)
    int progressInterval = Math.max(1, compounds.size() / 20);
    for (int i = 0; i < compounds.size(); i++)
    {
        ChemicalCompound compound = compounds.get(i);
        if (i % progressInterval == 0)
            _log.debug("Created adducts for " + i + " compounds...");
        List<Adduct> adductsThisCompound = new ArrayList<Adduct>();
        Adduct baseAdduct = new Adduct(compound);
        if (useUnmodifiedAdduct) {
            //add the unmodified mass
            adductsThisCompound.add(new Adduct(baseAdduct));
        }
        //Adducts are created with /one/ of the modifications or with /none/ of them
        if (!chemicalModifications.isEmpty())
        {
            for (ChemicalModification mod : chemicalModifications)
            {
                if (mod.canPerform(baseAdduct))
                {
                    Adduct modAdduct = new Adduct(compound);
                    mod.perform(modAdduct);
                    adductsThisCompound.add(modAdduct);
                    //accounting
                    List<ChemicalCompound> compoundsThisMod = modCompoundsMap.get(mod);
                    if (compoundsThisMod == null)
                    {
                        compoundsThisMod = new ArrayList<ChemicalCompound>();
                        modCompoundsMap.put(mod, compoundsThisMod);
                    }
                    compoundsThisMod.add(compound);
                }
            }
        }
        //Bucket this compound's adducts under their (canonicalized) formulas
        for (Adduct adduct : adductsThisCompound)
        {
            ChemicalFormula formula = adduct.getFormula();
            if (formulaStringFormulaMap.containsKey(formula.toString()))
                formula = formulaStringFormulaMap.get(formula.toString());
            else formulaStringFormulaMap.put(formula.toString(), formula);
            List<Adduct> adductsThisFormula = allFormulaAdductsMap.get(formula);
            if (adductsThisFormula == null)
            {
                adductsThisFormula = new ArrayList<Adduct>();
                allFormulaAdductsMap.put(formula, adductsThisFormula);
            }
            adductsThisFormula.add(adduct);
        }
    }
    //todo: this is expensive but useful. Need to put this back in if possible
    if (!chemicalModifications.isEmpty())
    {
        _log.debug("******NOT CHECKING FOR MODS WITH DUPE STRUCTURE!!!! TOO EXPENSIVE!! PUT BACK IN IF POSSIBLE****");
        // _log.debug("Checking for duplicate SMILES strings...");
        // //Remove modified duplicate adducts
        // for (List<Adduct> adductsForAFormula : allFormulaAdductsMap.values())
        //     removeModifiedWhenDuplicateSMILES(adductsForAFormula);
    }
    if (!chemicalModifications.isEmpty())
    {
        ApplicationContext.infoMessage("Modifications and numbers of compounds applied to (out of " +
                compounds.size() + "):");
        for (ChemicalModification mod : chemicalModifications)
        {
            int numComps = 0;
            if (modCompoundsMap.containsKey(mod))
                numComps = modCompoundsMap.get(mod).size();
            ApplicationContext.infoMessage(mod.getName() + ": " + numComps);
        }
    }
    _log.debug("Created adducts for all compounds");
    return allFormulaAdductsMap;
}
/**
 * Remove any modified Adducts for which an unmodified Adduct exists with the same SMILES string.
 * @param adducts list of adducts to prune in place
 */
public void removeModifiedWhenDuplicateSMILES(List<Adduct> adducts)
{
    //Perf: a HashSet gives O(1) SMILES lookups; the previous List.contains scan made this
    //method O(n^2) in the number of adducts
    Set<String> unmodifiedSMILESStrings = new HashSet<String>();
    List<Adduct> modifiedAdducts = new ArrayList<Adduct>();
    for (Adduct adduct : adducts)
    {
        if (adduct.getModifications() == null || adduct.getModifications().isEmpty())
            unmodifiedSMILESStrings.add(ChemCalcs.createSMILESString(adduct.getMolecule()));
        else
            modifiedAdducts.add(adduct);
    }
    //Iterate the separate modified-adducts list so removal from 'adducts' is safe
    for (Adduct modifiedAdduct : modifiedAdducts)
    {
        if (unmodifiedSMILESStrings.contains(ChemCalcs.createSMILESString(modifiedAdduct.getMolecule())))
            adducts.remove(modifiedAdduct);
    }
}
/**
 * Mass-match features against the default formula->adducts map for the full compound database
 * (built lazily on first use).  Convenience overload of the full massMatchFull.
 * @param featuresSortedMassAsc features, which must be sorted by mass ascending
 * @param massTolPPM mass tolerance, in PPM
 * @param numPeaksMustMatch at least numPeaksMustMatch peaks ALL must match within tolerance
 * @return map from matched features to maps from matched formulas to their adducts
 */
public Map<Feature, Map<ChemicalFormula, List<Adduct>>> massMatchFull(Feature[] featuresSortedMassAsc,
float massTolPPM, int numPeaksMustMatch)
{
return massMatchFull(featuresSortedMassAsc, getOrCreateFormulaAdductsMap(), massTolPPM, numPeaksMustMatch);
}
/**
 * Mass-match features to chemical formulas using the provided formula->adducts map.
 * Both features and candidate formulas are processed in mass-ascending order, so the
 * low end of the candidate window only ever advances.
 * @param featuresSortedMassAsc features, which MUST be sorted by mass ascending
 * @param formulaAdductsMap map from formulas to adducts to match against
 * @param massTolPPM mass tolerance, in PPM
 * @param numPeaksMustMatch at least numPeaksMustMatch peaks ALL must match within tolerance
 * @return map from matched features to maps from matched formulas to their adducts
 */
public Map<Feature, Map<ChemicalFormula, List<Adduct>>> massMatchFull(Feature[] featuresSortedMassAsc,
                                              Map<ChemicalFormula, List<Adduct>> formulaAdductsMap,
                                              float massTolPPM, int numPeaksMustMatch)
{
    //make absolutely sure feature masses are fixed: mass = mz * charge (no proton subtraction)
    for (Feature feature : featuresSortedMassAsc)
        feature.setMass(feature.getMz() * feature.getCharge());
    //Bug fix: this previously read the allFormulaAdductsMap FIELD rather than the
    //formulaAdductsMap argument, so callers passing a custom map (e.g. massMatchBestOnly from
    //calcRunMassError) matched against the wrong formulas -- or hit an NPE if the field was unset
    List<ChemicalFormula> formulasByMassAsc = new ArrayList<ChemicalFormula>(formulaAdductsMap.keySet());
    Collections.sort(formulasByMassAsc, new ChemicalFormula.ComparatorMassAsc());
    _log.debug("Unique chemical formulas in database, with adducts: " + formulasByMassAsc.size());
    if (_log.isDebugEnabled())
    {
        List<Integer> adductCountsByFormula = new ArrayList<Integer>();
        for (List<Adduct> adductsForFormula : formulaAdductsMap.values())
            adductCountsByFormula.add(adductsForFormula.size());
        _log.debug("Mean adducts per chemical formula: " + BasicStatistics.mean(adductCountsByFormula));
    }
    int minPossibleMassIndex = 0;
    Map<Feature, Map<ChemicalFormula, List<Adduct>>> result =
            new HashMap<Feature, Map<ChemicalFormula, List<Adduct>>>();
    for (Feature feature : featuresSortedMassAsc)
    {
        //not using calculated feature mass, because that presumes M+H and subtracts the H
        float featureMass = feature.getMz() * feature.getCharge();
        float massToleranceDa = MassUtilities.calculateAbsoluteDeltaMass(featureMass, massTolPPM,
                FeatureSetMatcher.DELTA_MASS_TYPE_PPM);
        float minMatchMass = featureMass - massToleranceDa;
        float maxMatchMass = featureMass + massToleranceDa;
        //Advance the low end of the candidate window; features are mass-ascending, so this
        //index never needs to move backward
        while (formulasByMassAsc.get(minPossibleMassIndex).getCommonestIsotopeMass() < minMatchMass &&
               minPossibleMassIndex < formulasByMassAsc.size() - 1)
        {
            minPossibleMassIndex++;
        }
        List<ChemicalFormula> formulasMatchedThisFeature = new ArrayList<ChemicalFormula>();
        for (int i = minPossibleMassIndex; i < formulasByMassAsc.size(); i++)
        {
            ChemicalFormula formula = formulasByMassAsc.get(i);
            float compoundMass = (float) formula.getCommonestIsotopeMass();
            if (compoundMass < minMatchMass)
                continue;
            if (maxMatchMass < compoundMass)
                break;  //formulas are mass-ascending: nothing later can match
            if (numPeaksMustMatch > 1)
            {
                int minPeaksFeatureOrCompound = Math.min(feature.comprised.length,
                        formula.getPeakMasses().length);
                if (minPeaksFeatureOrCompound < numPeaksMustMatch)
                    continue;
                int numMatched = 1;  //the monoisotopic peak already matched above
                for (int peakInd = 1; peakInd < minPeaksFeatureOrCompound; peakInd++)
                {
                    float peakMass = Feature.convertMzToMass(feature.comprised[peakInd].mz, feature.charge);
                    float deltaMassPPM =
                            MassUtilities.convertDaToPPM((float) formula.getPeakMasses()[peakInd] - peakMass,
                                    peakMass);
                    //Bug fix: this previously counted peaks whose error EXCEEDED the tolerance
                    //(inverted condition), so worse-matching peaks raised the match count
                    if (Math.abs(deltaMassPPM) <= massTolPPM)
                        numMatched++;
                }
                if (numMatched >= numPeaksMustMatch)
                    formulasMatchedThisFeature.add(formula);
            }
            else
                formulasMatchedThisFeature.add(formula);
        }
        if (!formulasMatchedThisFeature.isEmpty())
        {
            Map<ChemicalFormula, List<Adduct>> resultThisFeature = new HashMap<ChemicalFormula, List<Adduct>>();
            for (ChemicalFormula formula : formulasMatchedThisFeature)
                resultThisFeature.put(formula, formulaAdductsMap.get(formula));
            result.put(feature, resultThisFeature);
        }
    }
    return result;
}
/**
 * Estimate the overall mass error (PPM) of a run.  Multi-peak features are matched loosely to
 * database compounds that have two usable isotopic peaks; the differences between first-peak and
 * second-peak mass errors are fit with an EM normal-plus-uniform mixture, and the run mass error
 * is derived from the fitted sigma.
 * @param featureSet features from the run; only features with more than one peak are used
 * @param looseMassTolerancePPM loose tolerance for the initial database match
 * @return estimated run mass error, in PPM
 * @throws IOException if too few 2-peak matches are found or the EM fit fails
 */
public double calcRunMassError(FeatureSet featureSet, float looseMassTolerancePPM) throws IOException
{
//Partition database compounds: all usable compounds, and those with a sane second isotopic peak
List<ChemicalCompound> compoundsList = new ArrayList<ChemicalCompound>();
List<ChemicalCompound> compoundsWith2PeaksList = new ArrayList<ChemicalCompound>();
// List<Float> dbAllMassPeakDistances = new ArrayList<Float>();
// List<Float> dbAllMassPeakDistancesPPMDist = new ArrayList<Float>();
// List<Float> dbAllMasses = new ArrayList<Float>();
for (ChemicalCompound compound : databaseCompounds)
{
try
{
compoundsList.add(compound);
if (compound.getPeakMasses().length == 1)
{
_log.debug("Single-peak compound " + compound);
}
else
{
float peakDist = (float)(compound.getPeakMasses()[1] - compound.getPeakMasses()[0]);
//Only keep compounds whose first two peaks are ~1Da apart (a plausible isotope spacing)
if (Math.abs(peakDist) < 1.5f)
{
// dbAllMasses.add((float) compound.getMass());
// dbAllMassPeakDistances.add(peakDist);
// if (Math.abs(MassUtilities.convertDaToPPM(peakDist - 1.0035f, (float) compound.getMass())) < 50)
// dbAllMassPeakDistancesPPMDist.add(MassUtilities.convertDaToPPM(peakDist - 1.0035f, (float) compound.getMass()));
compoundsWith2PeaksList.add(compound);
}
else
{
_log.debug("Weird second-peak mass: " + compound);
}
}
}
catch (IllegalArgumentException e)
{
ApplicationContext.setMessage("Skipping bad compound: " + e.getMessage());
}
}
// if (showCharts)
// {
// new PanelWithHistogram(dbAllMassPeakDistances, "db peak distances").displayInTab();
// new PanelWithHistogram(dbAllMassPeakDistancesPPMDist, "db peak spread ppm").displayInTab();
// new PanelWithScatterPlot(dbAllMasses, dbAllMassPeakDistances, "db mass vs peakdist").displayInTab();
// }
Collections.sort(compoundsWith2PeaksList, new ChemicalCompound.ComparatorMassAsc());
Collections.sort(compoundsList, new ChemicalCompound.ComparatorMassAsc());
//Only features with multiple peaks can contribute a second-peak error measurement
Feature[] features = featureSet.getFeatures();
List<Feature> featuresWithMultiplePeaksList = new ArrayList<Feature>();
for (Feature feature : features)
if (feature.peaks > 1)
featuresWithMultiplePeaksList.add(feature);
Feature[] featuresWithMultiplePeaks = featuresWithMultiplePeaksList.toArray(new Feature[featuresWithMultiplePeaksList.size()]);
ApplicationContext.infoMessage("Processing file " + featureSet.getSourceFile().getName() + " with " + features.length +
" features");
Arrays.sort(featuresWithMultiplePeaks, new Feature.MassAscComparator());
//disabling, because this is useless
if (showCharts && false)
{
List<Float> featureMasses = new ArrayList<Float>();
List<Float> featurePeakDistances = new ArrayList<Float>();
for (Feature feature : featuresWithMultiplePeaks)
{
float featureMass = feature.getMz() * feature.getCharge();
featureMasses.add(featureMass);
featurePeakDistances.add(feature.comprised[1].mz - feature.comprised[0].mz);
}
new PanelWithScatterPlot(featureMasses, featurePeakDistances, "feature_mass_peakdist").displayInTab();
}
//Best single formula match per feature, against only the 2-peak compounds
Map<Feature, Map<ChemicalFormula, List<Adduct>>> featureMassMatchMap =
massMatchBestOnly(featuresWithMultiplePeaks, createFormulaAdductsMap(compoundsWith2PeaksList),
looseMassTolerancePPM);
List<Float> deltaMassPPMList = new ArrayList<Float>();
List<Float> deltaMassPPMSecondPeakList = new ArrayList<Float>();
List<Float> compoundMassesFor2PeakDiffList = new ArrayList<Float>();
List<Float> peakDeltaMassDaDifferencesList = new ArrayList<Float>();
List<Float> peakDeltaMassPPMDifferencesList = new ArrayList<Float>();
List<Float> secondPeakDeltaMassPPMList = new ArrayList<Float>();
for (Feature feature : featureMassMatchMap.keySet())
{
float featureMass = feature.getMz() * feature.getCharge();
Map<ChemicalFormula, List<Adduct>> thisFeatureMatches = featureMassMatchMap.get(feature);
for (ChemicalFormula formula : thisFeatureMatches.keySet())
{
double compoundMass = formula.getCommonestIsotopeMass();
float massCompoundSecondPeak = (float) formula.getPeakMasses()[1];
//if (MassUtilities.convertDaToPPM(Math.abs((float) mass - trackedMass), trackedMass) < 1)
// System.err.println("MATCHED feature mass " + feature.getMass() + " to compound " + compound);
float deltaMassPPM = MassUtilities.convertDaToPPM(featureMass - (float) compoundMass, (float) compoundMass);
deltaMassPPMList.add(deltaMassPPM);
//NOTE(review): elsewhere in this class feature mass is computed as mz * charge; dividing
//by charge here looks suspect for charge > 1 -- confirm (equivalent only for charge 1)
float featureSecondPeakMass = feature.comprised[1].mz / feature.charge;
//System.err.println(mass + ", " + massCompoundSecondPeak + ", " + featureSecondPeakMass);
float deltaMassSecondPeakPPM =
MassUtilities.convertDaToPPM(featureSecondPeakMass - massCompoundSecondPeak,
massCompoundSecondPeak);
deltaMassPPMSecondPeakList.add(deltaMassSecondPeakPPM);
//System.err.println("****" + deltaMassPPM + ", " + deltaMassSecondPeakPPM);
//todo: 30 is arbitrary
if (Math.abs(deltaMassSecondPeakPPM - deltaMassPPM) < 30) {
compoundMassesFor2PeakDiffList.add((float) compoundMass);
peakDeltaMassPPMDifferencesList.add(deltaMassSecondPeakPPM - deltaMassPPM);
peakDeltaMassDaDifferencesList.add(featureSecondPeakMass - featureMass);
secondPeakDeltaMassPPMList.add(deltaMassSecondPeakPPM);
}
}
}
System.err.println("Median mass difference between first and second peak: " + BasicStatistics.median(peakDeltaMassDaDifferencesList));
if (peakDeltaMassPPMDifferencesList.size() < 10)
throw new IOException("Not enough 2-peak matches (" +
peakDeltaMassPPMDifferencesList.size() + ") to analyze distribution");
if (showCharts)
{
new PanelWithHistogram(secondPeakDeltaMassPPMList, "peak2_deltappm").displayInTab();
new PanelWithHistogram(peakDeltaMassPPMDifferencesList, "2peak_diff").displayInTab();
new PanelWithScatterPlot(compoundMassesFor2PeakDiffList, peakDeltaMassPPMDifferencesList, "mass_vs_2peak_diff",
"Compound Mass", "2-peak delta").displayInTab();
}
ApplicationContext.setMessage("Calculating mixed distribution...");
//NOTE(review): sorting deltaMassPPMList here breaks its index-pairing with
//deltaMassPPMSecondPeakList, which the showCharts block below pairs element-by-element --
//verify whether this sort is still needed (it fed the commented-out parity search below)
Collections.sort(deltaMassPPMList);
//Fit normal (true matches) + uniform (background) mixture to the peak-error differences
Pair<float[], float[]> paramsAndProbs =
calculateDistParamsAndProbsEM(peakDeltaMassPPMDifferencesList, DEFAULT_INITIAL_MATCH_PROPORTION);
float[] distParams = paramsAndProbs.first;
float[] probabilities = paramsAndProbs.second;
//todo: implicitly assuming mu==0. Should be true after calibration. But I don't think this is actually true
float mu = distParams[0];
float sigma = distParams[1];
//proportion and area are currently only used by the commented-out parity search below
float proportion = distParams[2];
float area = distParams[3];
float ppmContainingMostPoints = sigmaMultipleForErrorDist * sigma;
ApplicationContext.setMessage("Done calculating mixed distribution. sigma=" + sigma +
". Most points within " + ppmContainingMostPoints + "ppm");
ApplicationContext.setMessage("Mu: " + mu);
//The fitted distribution is of the DIFFERENCE of two peak errors; dividing by sqrt(2)
//recovers the single-measurement error
float massErrorPPM = (float) (ppmContainingMostPoints / Math.sqrt(2));
ApplicationContext.setMessage("Run mass error: " + massErrorPPM);
// float parity = 0.5f;
//
// int densParityIndex = Math.abs(Arrays.binarySearch(probabilities, parity));
// float ppmAtParity = deltaMassPPMList.get(densParityIndex);
// _log.debug("Optimizing ppm tolerance, staring with " + ppmAtParity + "ppm");
// double closestProbability = Float.MAX_VALUE;
// float increment = 0.2f;
// while (ppmAtParity > 0)
// {
// double proportionTimesNormdist = proportion * BasicStatistics.calcNormalCumDensity(mu, sigma, ppmAtParity);
// double probability = proportionTimesNormdist / (proportionTimesNormdist + ((1-proportion) / area) );
//System.err.println("PPM: " + ppmAtParity + ", prob: " + probability);
// if (densParityIndex == 0 || probability >= closestProbability)
// break;
// ppmAtParity -= increment;
// }
// float[] fdrs = AmtMatchProbabilityAssigner.calculateFDRsForProbabilities(probabilities);
// new PanelWithHistogram(fdrs, "FDRs").displayInTab();
if (showCharts)
{
List<Float> deltaMassesKeptList = new ArrayList<Float>();
List<Float> deltaMassPPMSecondPeakKeptList = new ArrayList<Float>();
List<Float> deltaMassesSecondUnderList = new ArrayList<Float>();
List<Float> deltaMassPPMSecondPeakSecondUnderList = new ArrayList<Float>();
for (int i=0; i<deltaMassPPMList.size(); i++)
{
if (Math.abs(deltaMassPPMSecondPeakList.get(i)) < massErrorPPM)
{
deltaMassesSecondUnderList.add(deltaMassPPMList.get(i));
deltaMassPPMSecondPeakSecondUnderList.add(deltaMassPPMSecondPeakList.get(i));
if (Math.abs(deltaMassPPMList.get(i)) < massErrorPPM)
{
deltaMassesKeptList.add(deltaMassPPMList.get(i));
deltaMassPPMSecondPeakKeptList.add(deltaMassPPMSecondPeakList.get(i));
}
}
}
PanelWithScatterPlot pwspTwoPeakError =
new PanelWithScatterPlot(deltaMassesKeptList, deltaMassPPMSecondPeakKeptList, "2peakerror");
pwspTwoPeakError.addData(deltaMassesSecondUnderList, deltaMassPPMSecondPeakSecondUnderList, "peak 2 under");
pwspTwoPeakError.addData(deltaMassPPMList, deltaMassPPMSecondPeakList, "all points");
pwspTwoPeakError.setAxisLabels("Peak 1 Delta Mass", "Peak 2 Delta Mass");
pwspTwoPeakError.displayInTab();
PanelWithHistogram pwhDiffs = new PanelWithHistogram(peakDeltaMassPPMDifferencesList, "peak_dist_diff");
pwhDiffs.displayInTab();
float[] peakDeltaMassPPMDifferencesArray = new float[peakDeltaMassPPMDifferencesList.size()];
for (int i=0; i<peakDeltaMassPPMDifferencesArray.length; i++)
peakDeltaMassPPMDifferencesArray[i] = peakDeltaMassPPMDifferencesList.get(i);
//NOTE(review): calculateDistParamsAndProbsEM uniquifies its input, so 'probabilities' may be
//shorter than this array if there were duplicate delta values -- confirm the plot handles that
PanelWithScatterPlot deltaMassProbPlot = new PanelWithScatterPlot(peakDeltaMassPPMDifferencesArray,
probabilities, "PeakDeltaMassVsProb");
deltaMassProbPlot.setAxisLabels("Peak Pair DeltaMass (PPM)","Probability Matching Same Ion");
deltaMassProbPlot.displayInTab();
}
return massErrorPPM;
}
/**
 * Use the Expectation Maximization algorithm to determine the parameters of the target normal distribution
 * and the proportion of datapoints in that distribution vs. the uniform background.
 *
 * Masses are uniquified before being handed to R, so duplicate masses don't distort the fitted
 * distribution; the per-unique-mass probabilities returned by R are then fanned back out to
 * every original datapoint.
 *
 * @param targetMassErrorDataList mass error values for all matches; may contain duplicates
 * @param proportionTrue starting estimate of the proportion of datapoints in the normal distribution
 * @return a Pair: first element is float[] {mu, sigma, proportion, area}; second element holds,
 *         for each entry of targetMassErrorDataList, the probability of membership in the normal dist
 * @throws IOException if the probabilities can't be read back correctly from the R output
 * @throws IllegalArgumentException if there are fewer than 10 unique masses
 */
public Pair<float[],float[]> calculateDistParamsAndProbsEM(List<Float> targetMassErrorDataList,
                                                           float proportionTrue)
        throws IOException
{
    // Uniquify masses, so duplicate masses don't mess up the distribution. A HashMap lookup
    // replaces the original List.indexOf scan, turning this loop from O(n^2) into O(n);
    // HashMap uses the same Float.equals semantics indexOf did.
    Map<Float, Integer> massToUniqueIndexMap = new HashMap<Float, Integer>();
    // unique-mass index -> all indexes in targetMassErrorDataList sharing that mass
    Map<Integer, List<Integer>> uniqueOrigIndexesMap = new HashMap<Integer, List<Integer>>();
    List<Float> uniqueMassesList = new ArrayList<Float>();
    for (int i = 0; i < targetMassErrorDataList.size(); i++)
    {
        Float mass = targetMassErrorDataList.get(i);
        Integer uniqueIndex = massToUniqueIndexMap.get(mass);
        if (uniqueIndex == null)
        {
            uniqueMassesList.add(mass);
            uniqueIndex = uniqueMassesList.size() - 1;
            massToUniqueIndexMap.put(mass, uniqueIndex);
        }
        List<Integer> indexes = uniqueOrigIndexesMap.get(uniqueIndex);
        if (indexes == null)
        {
            indexes = new ArrayList<Integer>();
            uniqueOrigIndexesMap.put(uniqueIndex, indexes);
        }
        indexes.add(i);
    }

    if (uniqueMassesList.size() < 10)
        throw new IllegalArgumentException("calculateDistParamsAndProbsEM: Too few mass matches (" +
                uniqueMassesList.size() + ") to calculate parameters");

    double[] uniqueMasses = new double[uniqueMassesList.size()];
    for (int i = 0; i < uniqueMasses.length; i++)
        uniqueMasses[i] = uniqueMassesList.get(i);
    int numPoints = uniqueMasses.length;

    // Build the scalar and vector variable maps handed to the R EM script
    Map<String, Object> rScalarVarMap = new HashMap<String, Object>();
    Map<String, double[]> rVectorVarMap = new HashMap<String, double[]>();
    rVectorVarMap.put("targetx", uniqueMasses);
    _log.debug("calculateDistParamsAndProbsEM, targetx: " + uniqueMasses.length);
    rScalarVarMap.put("proportion", (double) proportionTrue);
    // "area" is the width of the uniform background component
    double area = BasicStatistics.max(uniqueMasses) - BasicStatistics.min(uniqueMasses);
    rScalarVarMap.put("area", area);
    rScalarVarMap.put("miniterations", (double) 30);
    rScalarVarMap.put("maxiterations", (double) 80);
    rScalarVarMap.put("max_deltap_proportion_for_stable", (double) .005f);
    rScalarVarMap.put("max_deltap_proportion", (double) .005f + 0.001);
    rScalarVarMap.put("iters_stable_for_converg", (double) 1);
    rScalarVarMap.put("showcharts", showCharts ? "TRUE" : "FALSE");

    File outChartFile = null;
    File normTestOutChartFile = null;
    // try/finally so temp chart files are cleaned up even when R evaluation or output
    // parsing throws (the original only deleted them on the success path)
    try
    {
        if (showCharts)
        {
            outChartFile = TempFileManager.createTempFile("em_plots.jpg", this);
            normTestOutChartFile = TempFileManager.createTempFile("em_normtest_plots.jpg", this);
            rScalarVarMap.put("chart_out_file", "'" + RInterface.generateRFriendlyPath(outChartFile) + "'");
            rScalarVarMap.put("normtest_chart_out_file", "'" +
                    RInterface.generateRFriendlyPath(normTestOutChartFile) + "'");
        }
        String calculateProbabilitiesCommand =
                RInterface.readResourceFile("/org/fhcrc/cpl/viewer/metabologna/assign_massmatch_probabilities_em.R");
        //this timeout is arbitrary and dangerous. Woo hoo!
        int timeoutMilliseconds = 900000;
        String rResult = RInterface.evaluateRExpression(calculateProbabilitiesCommand,
                rScalarVarMap, rVectorVarMap, null, null, timeoutMilliseconds);
        Map<String, String> varResultStringMap =
                RInterface.extractVariableStringsFromListOutput(rResult);

        // Parse one probability per unique mass; chunks containing "[" are R row labels
        float[] errorProbabilities = new float[numPoints];
        int probsIndex = 0;
        String[] probChunks = varResultStringMap.get("probs").split("\\s");
        for (String probChunk : probChunks)
        {
            if (!probChunk.contains("[") && probChunk.length() > 0)
            {
                errorProbabilities[probsIndex++] = Float.parseFloat(probChunk);
            }
        }
        if (probsIndex != numPoints)
            throw new IOException("FAILED to read probabilities correctly back from R!");

        boolean converged = varResultStringMap.get("converged").contains("TRUE");
        String numIterString = varResultStringMap.get("num_iterations");
        // strip the leading "[1]"-style R index before parsing the iteration count
        numIterString = numIterString.substring(numIterString.indexOf("]") + 1).trim();
        int num_iterations = Integer.parseInt(numIterString);
        if (converged)
        {
            ApplicationContext.setMessage("EM estimation converged after " +
                    num_iterations + " iterations");
        }
        else
        {
            ApplicationContext.infoMessage("WARNING!!! EM estimation failed to converge after " +
                    num_iterations + " iterations");
        }

        // Kolmogorov-Smirnov result: third whitespace chunk holds the p-value
        String[] ksChunks = varResultStringMap.get("ksresults").split("\\s");
        float ks_score_x = Float.parseFloat(ksChunks[2]);
        //todo: move parsing of vector results into RInterface
        String[] distParamValues = varResultStringMap.get("dist_params").split("\\s");
        List<Float> paramVals = new ArrayList<Float>();
        for (String paramVal : distParamValues)
            if (paramVal != null && paramVal.length() > 1 && !paramVal.contains("["))
                paramVals.add(Float.parseFloat(paramVal));
        float mu_x = paramVals.get(0);
        float sigma_x = paramVals.get(1);
        float proportion = paramVals.get(2);
        _log.debug("Distribution params: mu_=" + mu_x +
                ", sigma=" + sigma_x + ", proportion=" + proportion);
        if (ks_score_x < 0.005f)
        {
            _log.debug("WARNING!!!! Kolmogorov-Smirnov test indicates that these matching data " +
                    "may not conform to a bivariate normal distribution intermingled with a uniform " +
                    "distribution. If this key assumption fails, match probabilities will not be " +
                    "accurate. Please consider re-running this analysis in non-parametric mode. " +
                    "KS p-value = " + ks_score_x);
        }
        else
        {
            ApplicationContext.setMessage("KS normality test passed. KS value = " + ks_score_x);
        }

        if (showCharts)
        {
            try
            {
                PanelWithBlindImageChart pwbic =
                        new PanelWithBlindImageChart(outChartFile, "EM Parameters");
                pwbic.displayInTab();
            }
            catch (Exception e)
            {
                ApplicationContext.errorMessage("Error displaying error cutoff chart images, file: "
                        + outChartFile.getAbsolutePath(), e);
            }
            try
            {
                PanelWithBlindImageChart pwbic =
                        new PanelWithBlindImageChart(normTestOutChartFile, "EM dist analysis");
                pwbic.displayInTab();
            }
            catch (Exception e)
            {
                ApplicationContext.infoMessage("Error displaying error cutoff chart images, file: "
                        + normTestOutChartFile.getAbsolutePath() + ", error: " + e.getMessage());
            }
        }

        float[] distParams = new float[] { mu_x, sigma_x, proportion, (float) area };
        // Fan the per-unique-mass probabilities back out to every original datapoint
        float[] matchProbabilities = new float[targetMassErrorDataList.size()];
        for (int i = 0; i < errorProbabilities.length; i++)
        {
            List<Integer> indexes = uniqueOrigIndexesMap.get(i);
            for (int matchIndex : indexes)
            {
                matchProbabilities[matchIndex] = errorProbabilities[i];
            }
        }
        return new Pair<float[], float[]>(distParams, matchProbabilities);
    }
    finally
    {
        TempFileManager.deleteTempFiles(this);
    }
}
/**
 * @return the mass tolerance (PPM) used when matching features for calibration
 */
public float getCalMassTolerancePPM()
{
return calMassTolerancePPM;
}
/**
 * @param calMassTolerancePPM the mass tolerance (PPM) used when matching features for calibration
 */
public void setCalMassTolerancePPM(float calMassTolerancePPM)
{
this.calMassTolerancePPM = calMassTolerancePPM;
}
/**
 * @return the maximum leverage allowed for a datapoint in the calibration regression
 */
public float getCalMaxLeverage()
{
return calMaxLeverage;
}
/**
 * @param calMaxLeverage the maximum leverage allowed for a datapoint in the calibration regression
 */
public void setCalMaxLeverage(float calMaxLeverage)
{
this.calMaxLeverage = calMaxLeverage;
}
/**
 * @return the maximum studentized residual allowed for a datapoint in the calibration regression
 */
public double getCalMaxStudRes()
{
return calMaxStudRes;
}
/**
 * @param calMaxStudRes the maximum studentized residual allowed for a datapoint in the calibration regression
 */
public void setCalMaxStudRes(double calMaxStudRes)
{
this.calMaxStudRes = calMaxStudRes;
}
/**
 * @return the chemical compounds from the metabolite database that features are matched against.
 * NOTE(review): returns the internal list directly; callers can mutate it.
 */
public List<ChemicalCompound> getDatabaseCompounds()
{
return databaseCompounds;
}
/**
 * Sets the chemical compounds from the metabolite database that features are matched against.
 * Parameter renamed from "databaseCompoundsByMass": nothing here sorts or requires sorting by
 * mass, and the old name implied otherwise. Java parameter names are not part of the binary
 * interface, so callers are unaffected.
 *
 * @param databaseCompounds the database compounds to match against; stored by reference, not copied
 */
public void setDatabaseCompounds(List<ChemicalCompound> databaseCompounds)
{
this.databaseCompounds = databaseCompounds;
}
/**
 * @return whether diagnostic charts are generated and displayed during matching
 */
public boolean isShowCharts()
{
return showCharts;
}
/**
 * @param showCharts whether diagnostic charts should be generated and displayed during matching
 */
public void setShowCharts(boolean showCharts)
{
this.showCharts = showCharts;
}
/**
 * @return the degrees of freedom used in the calibration regression
 */
public int getCalRegressionDoF() {
return calRegressionDoF;
}
/**
 * @param calRegressionDoF the degrees of freedom to use in the calibration regression
 */
public void setCalRegressionDoF(int calRegressionDoF) {
this.calRegressionDoF = calRegressionDoF;
}
/**
 * @return the chemical modifications considered when building adducts for matching
 */
public List<ChemicalModification> getChemicalModifications() {
return chemicalModifications;
}
/**
 * @param chemicalModifications the chemical modifications to consider when building adducts for matching
 */
public void setChemicalModifications(List<ChemicalModification> chemicalModifications) {
this.chemicalModifications = chemicalModifications;
}
/**
 * @return whether the unmodified adduct is included when matching
 */
public boolean isUseUnmodifiedAdduct() {
return useUnmodifiedAdduct;
}
/**
 * @param useUnmodifiedAdduct whether the unmodified adduct should be included when matching
 */
public void setUseUnmodifiedAdduct(boolean useUnmodifiedAdduct) {
this.useUnmodifiedAdduct = useUnmodifiedAdduct;
}
}