MetaboliteDatabaseMatcher.java example

Explorer
msInspect-master
/*
 * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.fhcrc.cpl.viewer.metabologna;

import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.proteomics.feature.Spectrum;
import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet;
import org.fhcrc.cpl.toolbox.proteomics.feature.matching.FeatureSetMatcher;
import org.fhcrc.cpl.toolbox.proteomics.MassUtilities;
import org.fhcrc.cpl.toolbox.chem.*;
import org.fhcrc.cpl.toolbox.gui.chart.PanelWithScatterPlot;
import org.fhcrc.cpl.toolbox.gui.chart.PanelWithHistogram;
import org.fhcrc.cpl.toolbox.gui.chart.PanelWithBlindImageChart;
import org.fhcrc.cpl.toolbox.statistics.RegressionUtilities;
import org.fhcrc.cpl.toolbox.statistics.BasicStatistics;
import org.fhcrc.cpl.toolbox.statistics.RInterface;
import org.fhcrc.cpl.toolbox.ApplicationContext;
import org.fhcrc.cpl.toolbox.Rounder;
import org.fhcrc.cpl.toolbox.commandline.CommandLineModuleExecutionException;
import org.fhcrc.cpl.toolbox.datastructure.Pair;
import org.fhcrc.cpl.toolbox.filehandler.TempFileManager;
import org.apache.log4j.Logger;

import java.util.*;
import java.io.IOException;
import java.io.File;

/**
 * If multiple Adducts have the same chemical formula, and some of them have modifications, any modified Adducts
 * with that chemical formula will be removed from matching results.
 *
 * Change feature masses to be mz * charge, rather than (mz - H) * charge
 */
public class MetaboliteDatabaseMatcher
{
    protected static Logger _log = Logger.getLogger(MetaboliteDatabaseMatcher.class);

    public static final float DEFAULT_INITIAL_MATCH_PROPORTION = 0.1f;

    public static final float DEFAULT_CALIBRATION_LOOSE_TOLERANCE_PPM = 5;
    public static final float DEFAULT_CALIBRATION_MAX_STUD_RES = 1;
    public static final float DEFAULT_CALIBRATION_MAX_LEVERAGE = 6;
    public static final int DEFAULT_CALIBRATION_REGRESSION_DEGREES_FREEDOM = 1;

    protected float calMassTolerancePPM = DEFAULT_CALIBRATION_LOOSE_TOLERANCE_PPM;
    protected float calMaxLeverage = DEFAULT_CALIBRATION_MAX_LEVERAGE;
    protected double calMaxStudRes = DEFAULT_CALIBRATION_MAX_STUD_RES;
    protected int calRegressionDoF = DEFAULT_CALIBRATION_REGRESSION_DEGREES_FREEDOM;

    protected Map<ChemicalFormula, List<Adduct>> allFormulaAdductsMap;

    protected List<ChemicalCompound> databaseCompounds = null;
    protected boolean showCharts = false;

    protected boolean useUnmodifiedAdduct = true;

    //When we're calculating the ppm error that contains 'most' of the points in the 2-peak error distribution,
    //how many standard deviations is 'most'?
    protected float sigmaMultipleForErrorDist = 2;

    //Chemical modifications to apply.  Adducts will be created with /one/ of these and with /none/ of these
    //todo: implment rules about applying different modifications to types of adducts
    protected List<ChemicalModification> chemicalModifications = new ArrayList<ChemicalModification>();

    /**
     * Return mass calibration coefficients.
     * Also, do the calibration.
     * @param features
     * @return calibration coefficients
     * @throws IOException
     */
    public double[] calibrateFeatureMasses(Feature[] features) throws IOException
    {
        _log.debug("calibrating feature masses. " + features.length + " features, mass tolerance: " +
                calMassTolerancePPM + "ppm");
        //First by time, then by mass
        Map<Feature, Map<ChemicalFormula, List<Adduct>>> featureMassMatchMap =
                massMatchFull(features, calMassTolerancePPM, 1);
        List<Float> origMasses = new ArrayList<Float>();
        _log.debug(featureMassMatchMap.size() + " features matched");

        /*
        //this code was used for calibrating feature masses by retention time.  I think this was misguided, although
        //it seemed to do something useful.
        double[] rtCalibrationCoefficients = null;
        if (calWithRTFirst) {
            _log.debug("Calibrating using RT");
            if (showCharts)
            {
                showMassDeltaMassChart(featureMassMatchMap, "DeltaMassVsMass_beforeRTCal");
            }
            rtCalibrationCoefficients = calcMassCalibrationUsingMatches(featureMassMatchMap, true, true);

            if (_log.isDebugEnabled())
            {
                System.err.print("RT calibration coefficients: ");
                for (double coeff : rtCalibrationCoefficients)
                    System.err.print(" " + coeff);
                System.err.println();
            }
            List<Float> featureAbsMassChangesPPM = new ArrayList<Float>();
            for (Feature feature : features)
            {
                float featureMass = feature.getMz() * feature.getCharge();
                origMasses.add(featureMass);
                float massChangePPM = (float) RegressionUtilities.mapValueUsingCoefficients(rtCalibrationCoefficients,
                        feature.getTime());
                float massChangeDa = MassUtilities.convertPPMToDa(massChangePPM, featureMass);
                float newMass = featureMass - massChangeDa;
                featureAbsMassChangesPPM.add(Math.abs(massChangePPM));

                feature.setMass(newMass);
                feature.setMz(newMass / feature.getCharge());
                if (feature.comprised != null)
                {
                    for (Spectrum.Peak peak : feature.comprised)
                    {
                        if (peak == null)
                            continue;
                        peak.setMz(peak.mz - (massChangeDa / Math.abs(feature.charge)));
//System.err.println("old=" + oldPeakMass + ", new=" + newPeakMass + ", ppmdiff: " + MassUtilities.convertDaToPPM(newPeakMass-oldPeakMass, oldPeakMass));
                    }
                }
            }
            _log.debug("Mean absolute mass change by RT: " + Rounder.round(BasicStatistics.mean(featureAbsMassChangesPPM),1) + "ppm");
        }
        */

        _log.debug("Calibrating by mass");
        featureMassMatchMap =
                massMatchFull(features, calMassTolerancePPM, 1);
        if (showCharts)
        {
            showMassDeltaMassChart(featureMassMatchMap, "DeltaMassVsMass_beforeMassCal"   );
        }

        double[] calibrationCoefficientsByMass = calcMassCalibrationUsingMatches(featureMassMatchMap, false, false);
        if (_log.isDebugEnabled())
        {
            System.err.print("Mass calibration coefficients: ");
            for (double coeff : calibrationCoefficientsByMass)
                System.err.print(" " + coeff);
            System.err.println();
        }
        List<Float> featureAbsMassChangesByMassPPM = new ArrayList<Float>();
        for (Feature feature : features)
        {
            //paranoia -- just in case feature.getMass() hasn't been corrected for non-peptide use
            float featureMass = feature.getMz() * feature.getCharge();
            float massDiffDa = (float) RegressionUtilities.mapValueUsingCoefficients(calibrationCoefficientsByMass,
                            featureMass);
            float newMass = featureMass - massDiffDa;
            featureAbsMassChangesByMassPPM.add(MassUtilities.convertDaToPPM(Math.abs(massDiffDa), featureMass));
            feature.setMass(newMass);
            feature.setMz(newMass / feature.getCharge());
            if (feature.comprised != null)
            {
                for (Spectrum.Peak peak : feature.comprised)
                {
                    if (peak == null)
                        continue;
                    float peakMass = peak.mz * feature.getCharge();
                    float massDiffDaThisPeak = (float) RegressionUtilities.mapValueUsingCoefficients(
                            calibrationCoefficientsByMass,
                            peakMass);
                    peak.setMz(peak.mz - (massDiffDaThisPeak/Math.abs(feature.charge)));
               
//System.err.println("old=" + oldPeakMass + ", new=" + newPeakMass + ", ppmdiff: " + MassUtilities.convertDaToPPM(newPeakMass-oldPeakMass, oldPeakMass));
                }
            }
        }
        _log.debug("Mean absolute mass change by mass: " + Rounder.round(BasicStatistics.mean(featureAbsMassChangesByMassPPM),1) + "ppm");

        if (_log.isDebugEnabled())
        {
            List<Float> totalMassChangesPPM = new ArrayList<Float>();
            for (int i=0; i<features.length; i++)
                totalMassChangesPPM.add(MassUtilities.convertDaToPPM(features[i].mass - origMasses.get(i), features[i].mass));
            _log.debug("Mean absolute mass change with both calibrations: " + Rounder.round(BasicStatistics.mean(totalMassChangesPPM),1) + "ppm");
            if (showCharts)
            {
                new PanelWithHistogram(totalMassChangesPPM, "Cal PPM Changes").displayInTab();
            }
        }

        featureMassMatchMap =
                massMatchFull(features, getOrCreateFormulaAdductsMap(), calMassTolerancePPM, 1);
        if (showCharts)
        {
            showMassDeltaMassChart(featureMassMatchMap, "DeltaMassVsMass_afterMassCal");

        }

        return calibrationCoefficientsByMass;
    }

    protected void showMassDeltaMassChart(Map<Feature, Map<ChemicalFormula, List<Adduct>>> featureMassMatchMap,
                                          String name)
    {
            List<Double> masses = new ArrayList<Double>();
            List<Double> deltaMassesPPM = new ArrayList<Double>();
            for (Feature feature : featureMassMatchMap.keySet())
            {
                float featureMass = feature.getMz() * feature.getCharge();
                for (ChemicalFormula formula : featureMassMatchMap.get(feature).keySet())
                {
                    masses.add(formula.getCommonestIsotopeMass());
                    deltaMassesPPM.add((double)MassUtilities.convertDaToPPM(featureMass -
                            (float) formula.getCommonestIsotopeMass(), (float)formula.getCommonestIsotopeMass()));
                }
            }
            new PanelWithScatterPlot(masses, deltaMassesPPM,name ).displayInTab();
    }

    protected double[] calcMassCalibrationUsingMatches(Map<Feature, Map<ChemicalFormula, List<Adduct>>> featureMassMatchMap,
                                                       boolean byTime, boolean usePPM) throws IOException
    {
        List<Double> massDiffsListPreCal = new ArrayList<Double>();
        List<Double> featureXvalListPreCal = new ArrayList<Double>();

        String chartSuffix = byTime ? "_RT" : "_MASS";
        chartSuffix = chartSuffix + (usePPM ? "_PPM" : "DA");
        String xAxisLabel = byTime ? "time" : "mass";

        for (Feature feature : featureMassMatchMap.keySet())
        {
            float featureMass = feature.getMz() * feature.getCharge();

            for (ChemicalFormula formula : featureMassMatchMap.get(feature).keySet())
            {
                double deltaMassDa = featureMass - formula.getCommonestIsotopeMass();
                massDiffsListPreCal.add(usePPM ?
                        MassUtilities.convertDaToPPM((float)deltaMassDa, (float) formula.getCommonestIsotopeMass()) : deltaMassDa);
                featureXvalListPreCal.add(byTime ? (double) feature.getTime() : featureMass);
            }
        }
        double[] featureXvals = new double[featureXvalListPreCal.size()];
        double[] massDiffs = new double[massDiffsListPreCal.size()];
        for (int i=0; i<featureXvals.length; i++)
        {
            featureXvals[i] = featureXvalListPreCal.get(i);
            massDiffs[i] = massDiffsListPreCal.get(i);
        }

        int[] indexesLowStuff = RegressionUtilities.selectIndexesWithLowLeverageAndStudentizedResidual(
                featureXvals, massDiffs, calMaxLeverage,
                calMaxStudRes, false, 1, false, false);

        double[] featureXvalsForRegression = new double[indexesLowStuff.length];
        double[] massDiffsForRegression = new double[indexesLowStuff.length];
        for (int i=0; i<indexesLowStuff.length; i++)
        {
            featureXvalsForRegression[i] = featureXvals[indexesLowStuff[i]];
            massDiffsForRegression[i] = massDiffs[indexesLowStuff[i]];
        }
        double[] resultCoefficients = null;
        resultCoefficients = RegressionUtilities.modalRegression(featureXvalsForRegression,
                massDiffsForRegression, calRegressionDoF);

        if (showCharts)
        {
            double minXval = BasicStatistics.min(featureXvals);
            double maxXval = BasicStatistics.max(featureXvals);

            PanelWithScatterPlot psp = new PanelWithScatterPlot();
            String chartName = "MassCal" + chartSuffix;
            psp.setName(chartName);
            psp.addLineOrCurve(resultCoefficients, minXval, maxXval);

            psp.addData(featureXvalsForRegression, massDiffsForRegression,
                    "Matches used in regression");
            psp.addData(featureXvals, massDiffs, "all mass matches");

            String massDiffLabel = "mass difference" + (usePPM ? " (ppm)" : " (da)");
            psp.setAxisLabels(xAxisLabel,massDiffLabel);
            psp.displayInTab();
        }
        return resultCoefficients;
    }

    public Map<Feature, Map<ChemicalFormula, List<Adduct>>> massMatchBestOnly(Feature[] features,
                                                           Map<ChemicalFormula, List<Adduct>> formulaAdductsMap,
                                                           float massTolPPM)
    {
        Map<Feature, Map<ChemicalFormula, List<Adduct>>> result =
                 massMatchFull(features, formulaAdductsMap, massTolPPM, 1);
        for (Feature feature : result.keySet())
        {
            double smallestDeltaMassPPM = Double.MAX_VALUE;
            ChemicalFormula bestMatch = null;
            for (ChemicalFormula formula : result.get(feature).keySet())
            {
                double deltaMassPPM = MassUtilities.convertDaToPPM(
                        (float)(feature.mass - formula.getCommonestIsotopeMass()), feature.mass);
                if (Math.abs(deltaMassPPM) < smallestDeltaMassPPM)
                {
                    smallestDeltaMassPPM = deltaMassPPM;
                    bestMatch = formula;
                }
            }
            Map<ChemicalFormula, List<Adduct>> bestMap = new HashMap<ChemicalFormula, List<Adduct>>();
            bestMap.put(bestMatch, result.get(feature).get(bestMatch));
            result.put(feature, bestMap);
        }
        return result;
    }


    protected Map<ChemicalFormula, List<Adduct>> getOrCreateFormulaAdductsMap()
    {
        if (allFormulaAdductsMap == null)
        {
            System.err.println("Generating formula->adducts map");
            allFormulaAdductsMap = createFormulaAdductsMap(databaseCompounds);
//for (List<Adduct> adductList : allFormulaAdductsMap.values()) { for (Adduct adduct : adductList) System.err.println(
//        adduct.getCompoundNameAndIonTypeString() + ", mass=" + adduct.getCommonestIsotopeMass()); }            
        }
        return allFormulaAdductsMap;
    }


    /**
     * Apply all chemical modifications to all compounds to which they can be performed, and assemble
     * the results into a map from formulas to lists of adducts with that formula.
     *
     * After generating full map, clean it up.  For each chemical formula, if there are multiple
     * adducts with the same SMILES string, remove any that have modifications
     * @param compounds
     * @return
     */
    protected Map<ChemicalFormula, List<Adduct>> createFormulaAdductsMap(List<ChemicalCompound> compounds)
    {
        Map<String, ChemicalFormula> formulaStringFormulaMap = new HashMap<String, ChemicalFormula>();
        Map<ChemicalFormula, List<Adduct>> allFormulaAdductsMap = new HashMap<ChemicalFormula, List<Adduct>>();
        Map<ChemicalModification, List<ChemicalCompound>> modCompoundsMap =
                new HashMap<ChemicalModification, List<ChemicalCompound>>();
        _log.debug("Creating formula-adducts map for " + compounds.size() + " compounds.");
        if (chemicalModifications.isEmpty())
            _log.debug("\tNo modifications to compounds");
        else
        {
            _log.debug("\tModifications to compounds: ");
            for (ChemicalModification mod : chemicalModifications)
            {
                _log.debug("\t\t" + mod.getName());
            }
        }
        for (int i=0; i<compounds.size(); i++)
        {
            ChemicalCompound compound = compounds.get(i);
            if (i % (compounds.size() / 20) == 0)
                   _log.debug("Created adducts for " + i + " compounds...");

            List<Adduct> adductsThisCompound = new ArrayList<Adduct>();

            Adduct baseAdduct = new Adduct(compound);

            if (useUnmodifiedAdduct) {
                //add the unmodified mass
                adductsThisCompound.add(new Adduct(baseAdduct));
            }

            if (!chemicalModifications.isEmpty())
            {
                for (ChemicalModification mod : chemicalModifications)
                {
                    if (mod.canPerform(baseAdduct))
                    {
                        Adduct modAdduct = new Adduct(compound);
                        mod.perform(modAdduct);
                        adductsThisCompound.add(modAdduct);
//System.err.println(modAdduct.getIonTypeString());                        
//System.err.println(new SmilesGenerator().createSMILES(compound.getCdkMolecule()));                        

                        //accounting
                        List<ChemicalCompound> compoundsThisMod = modCompoundsMap.get(mod);
                        if (compoundsThisMod == null)
                        {
                            compoundsThisMod = new ArrayList<ChemicalCompound>();
                            modCompoundsMap.put(mod, compoundsThisMod);
                        }
                        compoundsThisMod.add(compound);
                    }
                }
            }

            for (Adduct adduct : adductsThisCompound)
            {
                ChemicalFormula formula = adduct.getFormula();
                if (formulaStringFormulaMap.containsKey(formula.toString()))
                        formula = formulaStringFormulaMap.get(formula.toString());
                else formulaStringFormulaMap.put(formula.toString(), formula);

//if (uniqueFormulaStrings.contains(formula.toString()))
//    System.err.println("DUPE!!!! " + formula.toString());
//uniqueFormulaStrings.add(formula.toString());

                List<Adduct> adductsThisFormula = allFormulaAdductsMap.get(formula);
                if (adductsThisFormula == null)
                {
                    adductsThisFormula = new ArrayList<Adduct>();
                    allFormulaAdductsMap.put(formula, adductsThisFormula);
                }
                adductsThisFormula.add(adduct);
            }
        }

        //todo: this is expensive but useful.  Need to put this back in if possible
        if (!chemicalModifications.isEmpty())
        {
            _log.debug("******NOT CHECKING FOR MODS WITH DUPE STRUCTURE!!!! TOO EXPENSIVE!! PUT BACK IN IF POSSIBLE****");

//            _log.debug("Checking for duplicate SMILES strings...");
//            //Remove modified duplicate adducts
//            for (List<Adduct> adductsForAFormula : allFormulaAdductsMap.values())
//                removeModifiedWhenDuplicateSMILES(adductsForAFormula);
        }

        if (!chemicalModifications.isEmpty())
        {
            ApplicationContext.infoMessage("Modifications and numbers of compounds applied to (out of " +
                    compounds.size() + "):");
            for (ChemicalModification mod : chemicalModifications)
            {
                int numComps = 0;
                if (modCompoundsMap.containsKey(mod))
                    numComps = modCompoundsMap.get(mod).size();
                ApplicationContext.infoMessage(mod.getName() + ": " + numComps);
            }
        }
        _log.debug("Created adducts for all compounds");
        return allFormulaAdductsMap;
    }

    /**
     * Remove any modified Adducts for which an unmodified Adduct exists with the same SMILES string
     * @param adducts
     */
    public void removeModifiedWhenDuplicateSMILES(List<Adduct> adducts)
    {
        List<String> unmodifiedSMILESStrings = new ArrayList<String>();
        List<Adduct> modifiedAdducts = new ArrayList<Adduct>();
        for (Adduct adduct : adducts)
        {
            if (adduct.getModifications() == null || adduct.getModifications().isEmpty())
                 unmodifiedSMILESStrings.add(ChemCalcs.createSMILESString(adduct.getMolecule()));
            else
                modifiedAdducts.add(adduct);
        }
        for (Adduct adduct : modifiedAdducts)
        {
            if (unmodifiedSMILESStrings.contains(ChemCalcs.createSMILESString(adduct.getMolecule())))
                adducts.remove(adduct);
        }
    }

    /**
     * This one uses the default formula-adducts map
     * @param featuresSortedMassAsc
     * @param massTolPPM
     * @param numPeaksMustMatch
     * @return
     */
    public Map<Feature, Map<ChemicalFormula, List<Adduct>>> massMatchFull(Feature[] featuresSortedMassAsc,
                                                              float massTolPPM, int numPeaksMustMatch)
    {
        return massMatchFull(featuresSortedMassAsc, getOrCreateFormulaAdductsMap(), massTolPPM, numPeaksMustMatch);
    }



    /**
     * This version takes a map from formulas to adducts
     * @param featuresSortedMassAsc
     * @param massTolPPM
     * @param numPeaksMustMatch at least numPeaksMustMatch peaks ALL must match within tolerance
     * @return
     */
    public Map<Feature, Map<ChemicalFormula, List<Adduct>>> massMatchFull(Feature[] featuresSortedMassAsc,
                                                              Map<ChemicalFormula, List<Adduct>> formulaAdductsMap,
                                                              float massTolPPM, int numPeaksMustMatch)
    {
        //make absolutely sure feature masses are fixed.
        for (Feature feature : featuresSortedMassAsc)
            feature.setMass(feature.getMz() * feature.getCharge());
        List<ChemicalFormula> formulasByMassAsc = new ArrayList<ChemicalFormula>(allFormulaAdductsMap.keySet());
        Collections.sort(formulasByMassAsc, new ChemicalFormula.ComparatorMassAsc());
        _log.debug("Unique chemical formulas in database, with adducts: " + formulasByMassAsc.size());
        if (_log.isDebugEnabled())
        {
            List<Integer> adductCountsByFormula = new ArrayList<Integer>();
            for (List<Adduct> formulas : formulaAdductsMap.values())
                adductCountsByFormula.add(formulas.size());
            _log.debug("Mean adducts per chemical formula: " + BasicStatistics.mean(adductCountsByFormula));
//            if (showCharts)
//                new PanelWithHistogram(adductCountsByFormula, "# of Adducts Per Formula").displayInTab();
        }

        int minPossibleMassIndex = 0;
        Map<Feature, Map<ChemicalFormula, List<Adduct>>> result =
                new HashMap<Feature, Map<ChemicalFormula, List<Adduct>>>();
        boolean newFeature = true;
        for (Feature feature :featuresSortedMassAsc)
        {
            //not using calculated feature mass, because that presumes M+H and subtracts the H
            float featureMass = feature.getMz() * feature.getCharge();
            float massToleranceDa = MassUtilities.calculateAbsoluteDeltaMass(featureMass, massTolPPM,
                            FeatureSetMatcher.DELTA_MASS_TYPE_PPM);
            float minMatchMass = featureMass - massToleranceDa;
            float maxMatchMass = featureMass + massToleranceDa;

//            while ((formulasByMassAsc.get(minPossibleMassIndex).getCommonestIsotopeMass() + 1.0078125) < minMatchMass &&
            while (formulasByMassAsc.get(minPossibleMassIndex).getCommonestIsotopeMass() < minMatchMass &&
                   minPossibleMassIndex < formulasByMassAsc.size()-1)
            {
                minPossibleMassIndex++;
            }

            List<ChemicalFormula> formulasMatchedThisFeature = new ArrayList<ChemicalFormula>();
            for (int i=minPossibleMassIndex ; i< formulasByMassAsc.size(); i++)
            {
                ChemicalFormula formula = formulasByMassAsc.get(i);

                float compoundMass = (float) formula.getCommonestIsotopeMass();

                if (compoundMass < minMatchMass)
                    continue;
                else if (maxMatchMass < compoundMass)
                    break;
                else
                {
                    if (newFeature)
                    {
                        newFeature = false;
                    }
                    if (numPeaksMustMatch > 1)
                    {
                        float minPeaksFeatureOrCompound = Math.min(feature.comprised.length,
                                formula.getPeakMasses().length);
                        if (minPeaksFeatureOrCompound < numPeaksMustMatch)
                            continue;
                        int numMatched = 1;
                        for (int peakInd = 1; peakInd < minPeaksFeatureOrCompound; peakInd++)
                        {
                            float peakMass = Feature.convertMzToMass(feature.comprised[peakInd].mz, feature.charge);
                            float deltaMassPPM =
                                    MassUtilities.convertDaToPPM((float) formula.getPeakMasses()[peakInd] - peakMass,
                                                                 peakMass);
                            if (Math.abs(deltaMassPPM) > massTolPPM)
                            {
                                numMatched++;
                            }
                        }
                        if (numMatched >= numPeaksMustMatch)
                            formulasMatchedThisFeature.add(formula);
                    }
                    else
                        formulasMatchedThisFeature.add(formula);
                }
            }


            if (!formulasMatchedThisFeature.isEmpty())
            {
                Map<ChemicalFormula, List<Adduct>> resultThisFeature = new HashMap<ChemicalFormula, List<Adduct>>();
                for (ChemicalFormula formula : formulasMatchedThisFeature)
                    resultThisFeature.put(formula, formulaAdductsMap.get(formula));
                result.put(feature, resultThisFeature);
            }
        }
        return result;
    }

    /**
     *
     * @param featureSet
     * @param looseMassTolerancePPM
     * @return
     * @throws IOException
     */
    public double calcRunMassError(FeatureSet featureSet, float looseMassTolerancePPM) throws IOException
    {
        List<ChemicalCompound> compoundsList = new ArrayList<ChemicalCompound>();
        List<ChemicalCompound> compoundsWith2PeaksList = new ArrayList<ChemicalCompound>();

//        List<Float> dbAllMassPeakDistances = new ArrayList<Float>();
//        List<Float> dbAllMassPeakDistancesPPMDist = new ArrayList<Float>();
//        List<Float> dbAllMasses = new ArrayList<Float>();
        for (ChemicalCompound compound : databaseCompounds)
        {
            try
            {
                compoundsList.add(compound);
                if (compound.getPeakMasses().length == 1)
                {
                    _log.debug("Single-peak compound " + compound);
                }
                else
                {
                    float peakDist = (float)(compound.getPeakMasses()[1] - compound.getPeakMasses()[0]);
                    if (Math.abs(peakDist) < 1.5f)
                    {
//                        dbAllMasses.add((float) compound.getMass());
//                        dbAllMassPeakDistances.add(peakDist);
//                        if (Math.abs(MassUtilities.convertDaToPPM(peakDist - 1.0035f, (float) compound.getMass())) < 50)
//                            dbAllMassPeakDistancesPPMDist.add(MassUtilities.convertDaToPPM(peakDist - 1.0035f, (float) compound.getMass()));
                        compoundsWith2PeaksList.add(compound);
                    }
                    else
                    {
                        _log.debug("Weird second-peak mass: " + compound);
                    }
                }
            }
            catch (IllegalArgumentException e)
            {
                ApplicationContext.setMessage("Skipping bad compound: " + e.getMessage());
            }

        }

//        if (showCharts)
//        {
//            new PanelWithHistogram(dbAllMassPeakDistances, "db peak distances").displayInTab();
//            new PanelWithHistogram(dbAllMassPeakDistancesPPMDist, "db peak spread ppm").displayInTab();
//            new PanelWithScatterPlot(dbAllMasses, dbAllMassPeakDistances, "db mass vs peakdist").displayInTab();
//        }


        Collections.sort(compoundsWith2PeaksList, new ChemicalCompound.ComparatorMassAsc());
        Collections.sort(compoundsList, new ChemicalCompound.ComparatorMassAsc());


        Feature[] features = featureSet.getFeatures();
        List<Feature> featuresWithMultiplePeaksList = new ArrayList<Feature>();
        for (Feature feature : features)
            if (feature.peaks > 1)
                 featuresWithMultiplePeaksList.add(feature);
        Feature[] featuresWithMultiplePeaks = featuresWithMultiplePeaksList.toArray(new Feature[featuresWithMultiplePeaksList.size()]);

        ApplicationContext.infoMessage("Processing file " + featureSet.getSourceFile().getName() + " with " + features.length +
                " features");
        Arrays.sort(featuresWithMultiplePeaks, new Feature.MassAscComparator());

        //disabling, because this is useless
        if (showCharts && false)
        {
            List<Float> featureMasses = new ArrayList<Float>();
            List<Float> featurePeakDistances = new ArrayList<Float>();
            for (Feature feature : featuresWithMultiplePeaks)
            {
                float featureMass = feature.getMz() * feature.getCharge();
                featureMasses.add(featureMass);
                featurePeakDistances.add(feature.comprised[1].mz - feature.comprised[0].mz);
            }
            new PanelWithScatterPlot(featureMasses, featurePeakDistances, "feature_mass_peakdist").displayInTab();
        }


        Map<Feature, Map<ChemicalFormula, List<Adduct>>> featureMassMatchMap =
                massMatchBestOnly(featuresWithMultiplePeaks, createFormulaAdductsMap(compoundsWith2PeaksList),
                        looseMassTolerancePPM);
        List<Float> deltaMassPPMList = new ArrayList<Float>();
        List<Float> deltaMassPPMSecondPeakList = new ArrayList<Float>();
        List<Float> compoundMassesFor2PeakDiffList = new ArrayList<Float>();
        List<Float> peakDeltaMassDaDifferencesList = new ArrayList<Float>();
        List<Float> peakDeltaMassPPMDifferencesList = new ArrayList<Float>();
        List<Float> secondPeakDeltaMassPPMList = new ArrayList<Float>();


        for (Feature feature : featureMassMatchMap.keySet())
        {
            float featureMass = feature.getMz() * feature.getCharge();
            Map<ChemicalFormula, List<Adduct>> thisFeatureMatches = featureMassMatchMap.get(feature);
            for (ChemicalFormula formula : thisFeatureMatches.keySet())
            {
                double compoundMass = formula.getCommonestIsotopeMass();
                float massCompoundSecondPeak = (float) formula.getPeakMasses()[1];

//if (MassUtilities.convertDaToPPM(Math.abs((float) mass - trackedMass), trackedMass) < 1)
// System.err.println("MATCHED feature mass " + feature.getMass() + " to compound " + compound);
                float deltaMassPPM = MassUtilities.convertDaToPPM(featureMass - (float) compoundMass, (float) compoundMass);
                deltaMassPPMList.add(deltaMassPPM);

                float featureSecondPeakMass = feature.comprised[1].mz / feature.charge;
//System.err.println(mass + ", " + massCompoundSecondPeak + ", " + featureSecondPeakMass);
                float deltaMassSecondPeakPPM =
                        MassUtilities.convertDaToPPM(featureSecondPeakMass - massCompoundSecondPeak,
                                massCompoundSecondPeak);
                deltaMassPPMSecondPeakList.add(deltaMassSecondPeakPPM);
//System.err.println("****" + deltaMassPPM + ", " + deltaMassSecondPeakPPM);
                //todo: 30 is arbitrary
                if (Math.abs(deltaMassSecondPeakPPM - deltaMassPPM) < 30)  {
                    compoundMassesFor2PeakDiffList.add((float) compoundMass);
                    peakDeltaMassPPMDifferencesList.add(deltaMassSecondPeakPPM - deltaMassPPM);
                    peakDeltaMassDaDifferencesList.add(featureSecondPeakMass - featureMass);
                    secondPeakDeltaMassPPMList.add(deltaMassSecondPeakPPM);
                }
            }
        }
        System.err.println("Median mass difference between first and second peak: " + BasicStatistics.median(peakDeltaMassDaDifferencesList));


        if (peakDeltaMassPPMDifferencesList.size() < 10)
            throw new IOException("Not enough 2-peak matches (" + 
                    peakDeltaMassPPMDifferencesList.size() + ") to analyze distribution");

        if (showCharts)
        {
            new PanelWithHistogram(secondPeakDeltaMassPPMList, "peak2_deltappm").displayInTab();            
            new PanelWithHistogram(peakDeltaMassPPMDifferencesList, "2peak_diff").displayInTab();
            new PanelWithScatterPlot(compoundMassesFor2PeakDiffList, peakDeltaMassPPMDifferencesList, "mass_vs_2peak_diff",
                    "Compound Mass", "2-peak delta").displayInTab();            
        }

        ApplicationContext.setMessage("Calculating mixed distribution...");
        Collections.sort(deltaMassPPMList);
        Pair<float[], float[]> paramsAndProbs =
                calculateDistParamsAndProbsEM(peakDeltaMassPPMDifferencesList, DEFAULT_INITIAL_MATCH_PROPORTION);
        float[] distParams = paramsAndProbs.first;
        float[] probabilities = paramsAndProbs.second;

        //todo: implicitly assuming mu==0.  Should be true after calibration.  But I don't think this is actually true
        float mu = distParams[0];
        float sigma = distParams[1];
        float proportion = distParams[2];
        float area = distParams[3];

        float ppmContainingMostPoints = sigmaMultipleForErrorDist * sigma;
        ApplicationContext.setMessage("Done calculating mixed distribution.  sigma=" + sigma +
                ".  Most points within " + ppmContainingMostPoints + "ppm");
        ApplicationContext.setMessage("Mu: " + mu);
        float massErrorPPM = (float) (ppmContainingMostPoints / Math.sqrt(2));
        ApplicationContext.setMessage("Run mass error: " + massErrorPPM);
//        float parity = 0.5f;
//
//        int densParityIndex = Math.abs(Arrays.binarySearch(probabilities, parity));
//        float ppmAtParity = deltaMassPPMList.get(densParityIndex);
//        _log.debug("Optimizing ppm tolerance, staring with " + ppmAtParity + "ppm");
//        double closestProbability = Float.MAX_VALUE;
//        float increment = 0.2f;
//        while (ppmAtParity > 0)
//        {
//            double proportionTimesNormdist = proportion * BasicStatistics.calcNormalCumDensity(mu, sigma, ppmAtParity);
//            double probability = proportionTimesNormdist / (proportionTimesNormdist + ((1-proportion) / area) );
//System.err.println("PPM: " + ppmAtParity + ", prob: " + probability);
//            if (densParityIndex == 0 || probability >= closestProbability)
//                break;
//            ppmAtParity -= increment;
//        }


//        float[] fdrs = AmtMatchProbabilityAssigner.calculateFDRsForProbabilities(probabilities);
//        new PanelWithHistogram(fdrs, "FDRs").displayInTab();

        if (showCharts)
        {
            List<Float> deltaMassesKeptList = new ArrayList<Float>();
            List<Float> deltaMassPPMSecondPeakKeptList = new ArrayList<Float>();
            List<Float> deltaMassesSecondUnderList = new ArrayList<Float>();
            List<Float> deltaMassPPMSecondPeakSecondUnderList = new ArrayList<Float>();
            for (int i=0; i<deltaMassPPMList.size(); i++)
            {
                if (Math.abs(deltaMassPPMSecondPeakList.get(i)) < massErrorPPM)
                {
                    deltaMassesSecondUnderList.add(deltaMassPPMList.get(i));
                    deltaMassPPMSecondPeakSecondUnderList.add(deltaMassPPMSecondPeakList.get(i));
                    if (Math.abs(deltaMassPPMList.get(i)) < massErrorPPM)
                    {
                        deltaMassesKeptList.add(deltaMassPPMList.get(i));
                        deltaMassPPMSecondPeakKeptList.add(deltaMassPPMSecondPeakList.get(i));                                  
                    }
                }
            }
            PanelWithScatterPlot pwspTwoPeakError =
                    new PanelWithScatterPlot(deltaMassesKeptList, deltaMassPPMSecondPeakKeptList, "2peakerror");
            pwspTwoPeakError.addData(deltaMassesSecondUnderList, deltaMassPPMSecondPeakSecondUnderList, "peak 2 under");
            pwspTwoPeakError.addData(deltaMassPPMList, deltaMassPPMSecondPeakList, "all points");
            pwspTwoPeakError.setAxisLabels("Peak 1 Delta Mass", "Peak 2 Delta Mass");
            pwspTwoPeakError.displayInTab();
            PanelWithHistogram pwhDiffs = new PanelWithHistogram(peakDeltaMassPPMDifferencesList, "peak_dist_diff");
            pwhDiffs.displayInTab();
            float[] peakDeltaMassPPMDifferencesArray = new float[peakDeltaMassPPMDifferencesList.size()];
            for (int i=0; i<peakDeltaMassPPMDifferencesArray.length; i++)
                peakDeltaMassPPMDifferencesArray[i] = peakDeltaMassPPMDifferencesList.get(i);
            PanelWithScatterPlot deltaMassProbPlot = new PanelWithScatterPlot(peakDeltaMassPPMDifferencesArray,
                    probabilities, "PeakDeltaMassVsProb");
            deltaMassProbPlot.setAxisLabels("Peak Pair DeltaMass (PPM)","Probability Matching Same Ion");
            deltaMassProbPlot.displayInTab();
        }

        return massErrorPPM;
    }




    /**
      * Use the Expectation Maximization algorithm to determine the parameters of the target normal distribution
     * and the proportion of datapoints in that distribution vs. the uniform background.
      * @param targetMassErrorDataList
      * @param proportionTrue
      * @return a float[] containing mu, sigma, and proportion in the normal dist
      * @throws IOException
      */
     public Pair<float[],float[]> calculateDistParamsAndProbsEM(List<Float> targetMassErrorDataList,
                                          float proportionTrue)
             throws IOException
     {
         //uniquify masses, so duplicate masses don't mess up distribution
         Map<Integer, List<Integer>> uniqueOrigIndexesMap = new HashMap<Integer, List<Integer>>();
         List<Float> uniqueMassesList = new ArrayList<Float>();

         for (int i=0; i<targetMassErrorDataList.size(); i++)
         {
             Float mass = targetMassErrorDataList.get(i);
             int uniqueIndex = uniqueMassesList.indexOf(mass);
             if (uniqueIndex == -1)
             {
                 uniqueMassesList.add(mass);
                 uniqueIndex = uniqueMassesList.size()-1;
             }

             List<Integer> indexes = uniqueOrigIndexesMap.get(uniqueIndex);
             if (indexes == null)
             {
                 indexes = new ArrayList<Integer>();
                 uniqueOrigIndexesMap.put(uniqueIndex, indexes);
             }
             indexes.add(i);
         }

         if (uniqueMassesList.size() < 10)
            throw new IllegalArgumentException("calculateDistParamsAndProbsEM: Too few mass matches (" +
                    uniqueMassesList.size() + ") to calculate parameters");

         double[] uniqueMasses = new double[uniqueOrigIndexesMap.size()];
         for (int i=0; i<uniqueMasses.length; i++)
            uniqueMasses[i] = uniqueMassesList.get(i);


         int numPoints = uniqueMasses.length;



         Map<String,Object> rScalarVarMap = new HashMap<String,Object>();
         Map<String,double[]> rVectorVarMap = new HashMap<String,double[]>();

         rVectorVarMap.put("targetx",uniqueMasses);

         _log.debug("calculateDistParamsAndProbsEM, targetx: " + uniqueMasses.length);


         rScalarVarMap.put("proportion",(double) proportionTrue);
         double area = BasicStatistics.max(uniqueMasses) - BasicStatistics.min(uniqueMasses);
         rScalarVarMap.put("area",area);
         rScalarVarMap.put("miniterations",(double) 30);
         rScalarVarMap.put("maxiterations",(double) 80);
         rScalarVarMap.put("max_deltap_proportion_for_stable",(double) .005f);
         rScalarVarMap.put("max_deltap_proportion",(double) .005f + 0.001);

         rScalarVarMap.put("iters_stable_for_converg",(double) 1);

         rScalarVarMap.put("showcharts",showCharts ? "TRUE" : "FALSE");
         File outChartFile = null;
         File normTestOutChartFile = null;

         if (showCharts)
         {
             outChartFile = TempFileManager.createTempFile("em_plots.jpg", this);
             normTestOutChartFile = TempFileManager.createTempFile("em_normtest_plots.jpg", this);

             rScalarVarMap.put("chart_out_file", "'" + RInterface.generateRFriendlyPath(outChartFile) + "'");
             rScalarVarMap.put("normtest_chart_out_file", "'" +
                     RInterface.generateRFriendlyPath(normTestOutChartFile) + "'");

         }

         String calculateProbabilitiesCommand =
           RInterface.readResourceFile("/org/fhcrc/cpl/viewer/metabologna/assign_massmatch_probabilities_em.R");

         //this timeout is arbitrary and dangerous.  Woo hoo!
         int timeoutMilliseconds=900000;
         String rResult = RInterface.evaluateRExpression(calculateProbabilitiesCommand,
                 rScalarVarMap, rVectorVarMap, null, null, timeoutMilliseconds);

         Map<String, String> varResultStringMap =
                 RInterface.extractVariableStringsFromListOutput(rResult);
         float[] errorProbabilities = new float[numPoints];

         int probsIndex = 0;
         String[] probChunks = varResultStringMap.get("probs").split("\\s");
         for (String probChunk : probChunks)
         {
             if (!probChunk.contains("[") && probChunk.length() > 0)
             {
                 errorProbabilities[probsIndex++] = Float.parseFloat(probChunk);
             }
         }
         if (probsIndex != numPoints)
             throw new IOException("FAILED to read probabilities correctly back from R!");

         boolean converged = varResultStringMap.get("converged").contains("TRUE");
         String numIterString = varResultStringMap.get("num_iterations");
         numIterString = numIterString.substring(numIterString.indexOf("]") + 1).trim();
         int num_iterations = Integer.parseInt(numIterString);
         if (converged)
         {

             ApplicationContext.setMessage("EM estimation converged after " +
                     num_iterations + " iterations");
         }
         else
         {
             ApplicationContext.infoMessage("WARNING!!! EM estimation failed to converge after " +
                     num_iterations + " iterations");
         }

         String[] ksChunks = varResultStringMap.get("ksresults").split("\\s");
         float ks_score_x = Float.parseFloat(ksChunks[2]);

         //todo: move parsing of vector results into RInterface
         String[] distParamValues = varResultStringMap.get("dist_params").split("\\s");
         List<Float> paramVals = new ArrayList<Float>();
         for (String paramVal : distParamValues)
             if (paramVal != null && paramVal.length() > 1 && !paramVal.contains("["))
                 paramVals.add(Float.parseFloat(paramVal));

         float mu_x = paramVals.get(0);
         float sigma_x = paramVals.get(1);
         float proportion = paramVals.get(2);


         _log.debug("Distribution params: mu_=" + mu_x+
                    ", sigma=" + sigma_x + ", proportion=" + proportion);


         if (ks_score_x < 0.005f )
         {
             _log.debug("WARNING!!!!  Kolmogorov-Smirnov test indicates that these matching data " +
                 "may not conform to a bivariate normal distribution intermingled with a uniform " +
                 "distribution.  If this key assumption fails, match probabilities will not be " +
                 "accurate.  Please consider re-running this analysis in non-parametric mode. " +
                 "KS p-value = " + ks_score_x);
         }
         else
         {
             ApplicationContext.setMessage("KS normality test passed.  KS value = " + ks_score_x );
         }

         if (showCharts)
         {
             try
             {
                 PanelWithBlindImageChart pwbic =
                         new PanelWithBlindImageChart(outChartFile,"EM Parameters");
                 pwbic.displayInTab();

             }
             catch (Exception e)
             {
                 ApplicationContext.errorMessage("Error displaying error cutoff chart images, file: "
                               + outChartFile.getAbsolutePath(),e);
             }
             try
             {
                 PanelWithBlindImageChart pwbic =
                         new PanelWithBlindImageChart(normTestOutChartFile,"EM dist analysis");
                 pwbic.displayInTab();
             }
             catch (Exception e)
             {
                 ApplicationContext.infoMessage("Error displaying error cutoff chart images, file: "
                               + normTestOutChartFile.getAbsolutePath() + ", error: " + e.getMessage());
             }
         }
         TempFileManager.deleteTempFiles(this);

         float[] distParams = new float[] { mu_x, sigma_x, proportion, (float) area };

         float[] matchProbabilities = new float[targetMassErrorDataList.size()];
         for (int i=0; i<errorProbabilities.length; i++)
         {
             List<Integer> indexes = uniqueOrigIndexesMap.get(i);
             for (int matchIndex : indexes)
             {
                matchProbabilities[matchIndex] = errorProbabilities[i];
             }
         }

         return new Pair<float[], float[]>(distParams, matchProbabilities);
     }



    public float getCalMassTolerancePPM()
    {
        return calMassTolerancePPM;
    }

    public void setCalMassTolerancePPM(float calMassTolerancePPM)
    {
        this.calMassTolerancePPM = calMassTolerancePPM;
    }

    public float getCalMaxLeverage()
    {
        return calMaxLeverage;
    }

    public void setCalMaxLeverage(float calMaxLeverage)
    {
        this.calMaxLeverage = calMaxLeverage;
    }

    public double getCalMaxStudRes()
    {
        return calMaxStudRes;
    }

    public void setCalMaxStudRes(double calMaxStudRes)
    {
        this.calMaxStudRes = calMaxStudRes;
    }

    public List<ChemicalCompound> getDatabaseCompounds()
    {
        return databaseCompounds;
    }

    public void setDatabaseCompounds(List<ChemicalCompound> databaseCompoundsByMass)
    {
        this.databaseCompounds = databaseCompoundsByMass;
    }

    public boolean isShowCharts()
    {
        return showCharts;
    }

    public void setShowCharts(boolean showCharts)
    {
        this.showCharts = showCharts;
    }

    public int getCalRegressionDoF() {
        return calRegressionDoF;
    }

    public void setCalRegressionDoF(int calRegressionDoF) {
        this.calRegressionDoF = calRegressionDoF;
    }

    public List<ChemicalModification> getChemicalModifications() {
        return chemicalModifications;
    }

    public void setChemicalModifications(List<ChemicalModification> chemicalModifications) {
        this.chemicalModifications = chemicalModifications;
    }

    public boolean isUseUnmodifiedAdduct() {
        return useUnmodifiedAdduct;
    }

    public void setUseUnmodifiedAdduct(boolean useUnmodifiedAdduct) {
        this.useUnmodifiedAdduct = useUnmodifiedAdduct;
    }
}