AmtDatabaseBuilder.java example

Explorer
msInspect-master
/*
 * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fhcrc.cpl.viewer.amt;

import java.util.*;
import java.io.File;

import org.apache.log4j.Logger;
import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet;
import org.fhcrc.cpl.toolbox.proteomics.feature.matching.FeatureSetMatcher;
import org.fhcrc.cpl.toolbox.proteomics.feature.matching.BaseFeatureSetMatcherImpl;
import org.fhcrc.cpl.toolbox.proteomics.feature.matching.Window2DFeatureSetMatcher;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.AmtExtraInfoDef;
import org.fhcrc.cpl.toolbox.proteomics.feature.filehandler.PepXMLFeatureFileHandler;
import org.fhcrc.cpl.toolbox.proteomics.MSRun;
import org.fhcrc.cpl.toolbox.proteomics.ProteomicsRegressionUtilities;
import org.fhcrc.cpl.toolbox.commandline.CommandLineModuleUtilities;
import org.fhcrc.cpl.toolbox.ApplicationContext;
import org.fhcrc.cpl.toolbox.statistics.BasicStatistics;
import org.fhcrc.cpl.toolbox.datastructure.Pair;
import org.fhcrc.cpl.toolbox.statistics.RegressionUtilities;
import org.fhcrc.cpl.toolbox.proteomics.MS2Modification;
import org.fhcrc.cpl.toolbox.proteomics.filehandler.PepXmlLoader;

/**
 * Builds AMT databases, in various ways
 *
 * If you want to build an AMT database in some other way, put a method in here to do it
 * your way.
 */
public class AmtDatabaseBuilder
{
    static Logger _log = Logger.getLogger(AmtDatabaseBuilder.class);

    //default cutoff values for tossing out observations with high studentized residuals
    //Marty suggested 2.3 for this
    public static final double DEFAULT_MAX_STUDENTIZED_RESIDUAL_FOR_REGRESSION = 2.0;
    //Marty suggested 3.0 for this
    public static final double DEFAULT_MAX_STUDENTIZED_RESIDUAL_FOR_INCLUSION = 2.0;

    //default parameters for median alignment
    public static final int DEFAULT_MIN_PEPTIDES_FOR_ALIGNMENT_REGRESSION=8;
    public static final int DEFAULT_MIN_OBSERVATIONS_FOR_ALIGNMENT_REGRESSION=3;

    //Default maximum leverage numerator (X/n) to keep for regression
    public static final double DEFAULT_MAX_LEVERAGE_NUMERATOR = 4;

    public static final float DEFAULT_MS1_MS2_MASS_TOLERANCE_PPM = 10;
    public static final float DEFAULT_MS1_MS2_TIME_TOLERANCE_SECONDS = 25;

    protected float ms1Ms2MassTolerancePPM = DEFAULT_MS1_MS2_MASS_TOLERANCE_PPM;
    protected float ms1Ms2TimeToleranceSeconds = DEFAULT_MS1_MS2_TIME_TOLERANCE_SECONDS;

    //If this is true, ignore any unknown modifications -- put the peptide in as though the
    //modification didn't exist
    public static final boolean DEFAULT_IGNORE_UNKNOWN_MODIFICATIONS = false;
    protected boolean ignoreUnknownModifications = DEFAULT_IGNORE_UNKNOWN_MODIFICATIONS;

    /**
     * Create the default N-Terminal modifications that are used by X!Tandem.
     * In later X!Tandem versions, these will be declared explicitly.
     * If they're not, need to add them.
     * @return
     */
    public static MS2Modification[] createDefaultNTerminalModifications()
    {
        MS2Modification eMod = new MS2Modification();
        eMod.setAminoAcid("E");
        eMod.setMassDiff(-18.0106f);
        eMod.setVariable(true);

        MS2Modification qMod = new MS2Modification();
        qMod.setAminoAcid("Q");
        qMod.setMassDiff(-17.0265f);
        qMod.setVariable(true);

        return new MS2Modification[]
                {
                        eMod,
                        qMod
                };
    }

    /**
     * calculate observed hydrophobicity for all features and
     * add them all as observations to a new database.
     *
     * This one's the workhorse.  Everything else calls it eventually
     *
     * @param scanOrTimeMode
     * @return
     */
    public static AmtDatabase createAmtDatabaseForRun(FeatureSet featureSet,
                                                      int scanOrTimeMode,
                                                      double[] timeToHCoefficients,
                                                      boolean calculateHydrophobicities,
                                                      Map<String, Integer> spectralCountsMap,
                                                      boolean ignoreUnknownModifications)
    {

        //build a map of known offsets from the features
        Feature[] features = featureSet.getFeatures();

        if (calculateHydrophobicities)
            AmtUtilities.addHydrophobicityToFeatures(
                        featureSet.getFeatures(), scanOrTimeMode,
                        timeToHCoefficients);

        //Now that we've got hydrophobicity on the features, create a db

        AmtDatabase result = new AmtDatabase();
        AmtRunEntry runEntry = new AmtRunEntry(timeToHCoefficients,
                                               null);

        MS2Modification[] runModifications = MS2ExtraInfoDef.getFeatureSetModifications(featureSet);


        //check for explicitly-declared E and Q modifications.  If they're not declared, add them
        boolean hasEorQMod = false;
        if (runModifications != null)
        {
            for (MS2Modification mod : runModifications)
            {
                String aa = mod.getAminoAcid();
                if ("E".equals(aa) || "Q".equals(aa))
                {
                    hasEorQMod=true;
                    break;
                }
            }
        }
        if (!hasEorQMod)
        {
            _log.debug("Adding default E and Q modifications, not specified explicitly in file");

            int existingModsLength=0;
            if (runModifications != null)
                existingModsLength = runModifications.length;
            MS2Modification[] defaultEQMods = createDefaultNTerminalModifications();
            MS2Modification[] newModifications =
                    new MS2Modification[existingModsLength + defaultEQMods.length];
            if (runModifications != null)
                System.arraycopy(runModifications, 0, newModifications, 0, existingModsLength);
            System.arraycopy(defaultEQMods, 0, newModifications, existingModsLength, defaultEQMods.length);
            runModifications = newModifications;
        }

        runEntry.setModifications(runModifications);
        if (_log.isDebugEnabled())
        {
            if (MS2ExtraInfoDef.getFeatureSetModifications(featureSet) != null)
                for (MS2Modification mod : runModifications)
                    _log.debug("Adding modification from run: " + mod);
        }
        File sourceFile = featureSet.getSourceFile();
        if (sourceFile != null)
            runEntry.setPepXmlFilename(featureSet.getSourceFile().getName());
        result.addRunEntry(runEntry);

        for (Feature feature : features)
        {
            String peptideSequence = MS2ExtraInfoDef.getFirstPeptide(feature);
            if (peptideSequence.contains("X"))
            {
                _log.debug("Skipping peptide sequence with aminoacid 'X'.  Peptide: " +
                        peptideSequence);
            }
            else
            {
                //first try to resolve the modifications.  If that works, fine.  If not, then either
                //bomb out, or throw a warning, depending on ignoreUnknownModifications
                List<MS2Modification>[] ms2Modifications =
                        (List<MS2Modification>[]) new List[peptideSequence.length()];
                try
                {
                    ms2Modifications = result.resolveMods(peptideSequence,
                                                        MS2ExtraInfoDef.getModifiedAminoAcids(feature),
                                                        runEntry);
                }
                catch (IllegalArgumentException e)
                {
                    if (ignoreUnknownModifications)
                    {
                        ApplicationContext.infoMessage("WARNING: could not resolve all peptide modifications: " +
                                e.getMessage());
                    }
                    else
                    {
                        throw e;
                    }
                }
                result.addObservation(peptideSequence, ms2Modifications, MS2ExtraInfoDef.getPeptideProphet(feature),
                                      AmtExtraInfoDef.getObservedHydrophobicity(feature), runEntry,
                                      spectralCountsMap, feature.getTime());


            }
        }
        return result;
    }

    protected static Map<String, Double> performInitialRegression(Feature[] allFeatures,
                                                                  int scanOrTimeMode)
    {
        Feature[] featuresForFirstRegression =
                ProteomicsRegressionUtilities.selectFeaturesWithLowLeverage(allFeatures,
                        DEFAULT_MAX_LEVERAGE_NUMERATOR, scanOrTimeMode);

        //Do an initial regression
        return AmtUtilities.calculateScanOrTimeHydrophobicityRelationship(
                        featuresForFirstRegression,
                        scanOrTimeMode,
                        false);
    }

    /**
     * Calculate "studentized residuals" for each feature.
     *
     * This is based on the residual of the feature, and on its leverage. It's a measure
     * of the certainty that the feature is messed up w.r.t. the rest of the features.
     *
     * @param allFeatures
     * the residual squared must be in order to be dropped from the set
     * @return
     */
    public static double[] calculateStudentizedResiduals(Feature[] allFeatures,
                                                         int scanOrTimeMode)
    {
        //number of features
        int n = allFeatures.length;

        if (n==0)
            throw new IllegalArgumentException("ERROR! No features!");

        if (scanOrTimeMode == ProteomicsRegressionUtilities.REGRESSION_MODE_TIME)
        {
            boolean nonzeroTimesExist = false;
            for (Feature feature : allFeatures)
            {
                if (feature.getTime() > 0)
                {
                    nonzeroTimesExist = true;
                    break;
                }
            }
            if (!nonzeroTimesExist)
            {
                throw new IllegalArgumentException("ERROR! Feature retention times are all zero!  " +
                        "Please populate feature " +
                        "times (e.g., using the 'populatems2times' command), or provide associated mzXML file(s) to " +
                        "use for populating scan times.");
            }
        }

        //Do an initial regression using all features
        Map<String, Double> scanOrTimeToHydroLine =
                performInitialRegression(allFeatures, scanOrTimeMode);
        //residuals based on the initial regression (y - yhat)
        double[] hydrophobicityResiduals = new double[n];
        //scan/time values for all features (x)
        double[] scanOrTimeValues = new double[n];

        //gather the information we need about each feature
        for (int i=0; i<n; i++)
        {
            //record all x values
            scanOrTimeValues[i] = ((scanOrTimeMode == ProteomicsRegressionUtilities.REGRESSION_MODE_TIME) ?
                            allFeatures[i].getTime() :
                            allFeatures[i].getScan());
            //this value was already implicitly calculated above, when regression was
            //performed, but we don't have easy access to it.  Anyway, fast.
            //This is yhat
            double predictedH =
                    RegressionUtilities.predictYFromX(
                            AmtUtilities.getSlopeFromRegressionLine(scanOrTimeToHydroLine),
                            AmtUtilities.getInterceptFromRegressionLine(scanOrTimeToHydroLine),
                            scanOrTimeValues[i]);
            //calculated H: y
            double calculatedH =
                             AmtUtilities.calculateNormalizedHydrophobicity(
                                     MS2ExtraInfoDef.getFirstPeptide(allFeatures[i]));

            //residual is y - yhat
            hydrophobicityResiduals[i] = calculatedH - predictedH;
        }

        //sigmaHydroError requires a denomenator of n-2, so can't use
        //BasicStatistics.standardDeviation
        double sumSquaredResiduals=0;
        for (double residual : hydrophobicityResiduals)
            sumSquaredResiduals += Math.pow(residual, 2);
        double sigmaHydroError = Math.sqrt(sumSquaredResiduals / (n - 2));

        //calculate the leverages of all values
        double[] leverages = BasicStatistics.leverages(scanOrTimeValues);

        //todo: switch this over to BasicStatistics.studentizedResiduals().
        //Not doing it now because right before 2.0 release
        double[] studentizedResiduals = new double[n];

        //1 + 1/n.  Same for all inputs, compute once
        double onePlusOneOverN = 1 + 1/n;

        //Find the "studentized residual" of every feature:
        for (int i=0; i < n; i++)
        {
            double scaledResidual = hydrophobicityResiduals[i] / sigmaHydroError;
            studentizedResiduals[i] = scaledResidual /
                    Math.sqrt(onePlusOneOverN + leverages[i]);
        }
        _log.debug("studentized residual mean: " + BasicStatistics.mean(studentizedResiduals) + ", stddev: " + BasicStatistics.standardDeviation(studentizedResiduals));

        return studentizedResiduals;
    }

    /**
     * Given an array of features, choose just the ones with residual <=
     * of maxStudentizedResidual.  This is done twice, so it's a method.
     * @param allFeatures
     * @param studentizedResiduals
     * @param maxStudentizedResidual
     * @return
     */
    public static Feature[]
            chooseFeaturesWithMaxStudentizedResidual(Feature[] allFeatures,
                                                     double[] studentizedResiduals,
                                                     double maxStudentizedResidual)
    {
        //square the max studentized residual, compare with square of studentized
        //residual (since studentized residual is signed)
        double maxStudentizedResidualSquared = Math.pow(maxStudentizedResidual,2);
        List<Feature> passingFeatures = new ArrayList<Feature>();
        for (int i=0; i<allFeatures.length; i++)
        {
            if (Math.pow(studentizedResiduals[i],2) <= maxStudentizedResidualSquared)
                passingFeatures.add(allFeatures[i]);
        }
        return passingFeatures.toArray(new Feature[0]);
    }

    /**
     * Count the spectra with each peptide identification.  Assumes filtering
     * has already been done such that these features are the ones we consider good
     * @param features
     * @return
     */
    protected static Map<String,Integer> countSpectraForPeptides(Feature[] features)
    {
        Map<String, Integer> result = new HashMap<String, Integer>();

        for (Feature feature : features)
        {
            String peptide = MS2ExtraInfoDef.getFirstPeptide(feature);
            if (peptide == null)
                continue;
            if (result.containsKey(peptide))
                result.put(peptide, result.get(peptide) + 1);
            else
                result.put(peptide, 1);
        }

        return result;
    }

    /**
     * Create an AMT database for a single run, doing those arcane things we do
     * to cast out the unworthy features.  Here's the sequence:
     * 1.  Calculate the leverage of all features
     * 2.  Toss out all features with leverage > 4/n, perform a regression to
     * map time to normalized H (first cut)
     * 3.  Calculate the studentized residual of all features from the original set
     * 4.  Take out features with studentized residual < 2.0, perform a regression
     * to map time to normalized H (final version)
     * 5.  Take features with studentized residual < 2.0, insert those peptides
     * into the database 
     *
     *
     * @param ms2FeatureSet
     * @param ms1FeatureSet
     * @param scanOrTimeMode
     * @param robustRegression
     * @param outNumFeaturesChosen
     * @param maxStudentizedResidualForRegression
     * @param maxStudentizedResidualForInclusion
     * @return
     */
    public AmtDatabase createAmtDatabaseForRun(FeatureSet ms2FeatureSet,
                                                      FeatureSet ms1FeatureSet,
                                                      int scanOrTimeMode,
                                                      boolean robustRegression,
                                                      Pair<Integer, Integer> outNumFeaturesChosen,
                                                      double maxStudentizedResidualForRegression,
                                                      double maxStudentizedResidualForInclusion)
    {
        Map<String, Integer> peptideSpectralCountsMap =
                countSpectraForPeptides(ms2FeatureSet.getFeatures());


        FeatureSet featureSetForProcessing = null;
        if (ms1FeatureSet != null)
        {
            keepOnlySinglyMatchedMS1Features(ms2FeatureSet, ms1FeatureSet);
            featureSetForProcessing = ms1FeatureSet;
        }
        else
        {
            AmtDatabaseMatcher.representPeptidesWithMedianTimePerPeptidePerMod(ms2FeatureSet);
            featureSetForProcessing = ms2FeatureSet;
        }

        if (featureSetForProcessing.getFeatures().length == 0)
        {
            ApplicationContext.infoMessage("\tNo features to add to database! Skipping.");
            return null;
        }
        ApplicationContext.infoMessage("\tFeatures to add to database: " + featureSetForProcessing.getFeatures().length);

        Feature[] allProcessingFeatures = featureSetForProcessing.getFeatures();
        double[] studentizedResiduals =
                calculateStudentizedResiduals(allProcessingFeatures,
                                              scanOrTimeMode);

        Feature[] featuresForRegression =
                chooseFeaturesWithMaxStudentizedResidual(allProcessingFeatures,
                        studentizedResiduals,
                        maxStudentizedResidualForRegression);

        Map<String, Double> timeHydroLine =
                AmtUtilities.calculateScanOrTimeHydrophobicityRelationship(
                        featuresForRegression,
                        scanOrTimeMode,
                        robustRegression);
        double[] timeToHCoefficients = new double[2];
        timeToHCoefficients[1] = AmtUtilities.getSlopeFromRegressionLine(timeHydroLine);
        timeToHCoefficients[0] = AmtUtilities.getInterceptFromRegressionLine(timeHydroLine);


        if (AmtUtilities.getSlopeFromRegressionLine(timeHydroLine) < 0)
                ApplicationContext.infoMessage("WARNING: negative regression line slope!");

        //the features for inclusion in the database may be chosen based on a different
        //cutoff from those chosen for regression.
        //TODO: Should we be recalculating the studentized residuals based on the prediction line
        //we calculated?
        Feature[] featuresForInclusion =
                chooseFeaturesWithMaxStudentizedResidual(allProcessingFeatures,
                        studentizedResiduals,
                        maxStudentizedResidualForInclusion);

        outNumFeaturesChosen.first = featuresForRegression.length;
        outNumFeaturesChosen.second = featuresForInclusion.length;
        _log.debug("Chose " + featuresForRegression.length + "/" + allProcessingFeatures.length +
                " features for regression");
        _log.debug("Chose " + featuresForInclusion.length + "/" + allProcessingFeatures.length +
                " features for inclusion");

        FeatureSet featureSetForInclusion = new FeatureSet(featuresForInclusion);
        featureSetForInclusion.setSourceFile(featureSetForProcessing.getSourceFile());
        MS2Modification[] modifications = MS2ExtraInfoDef.getFeatureSetModifications(featureSetForProcessing);
        _log.debug("Setting modifications: " + modifications);
        MS2ExtraInfoDef.setFeatureSetModifications(featureSetForInclusion,
                modifications);

        AmtDatabase amtDatabase =
                createAmtDatabaseForRun(featureSetForInclusion, scanOrTimeMode,
                        timeToHCoefficients,
                        true, peptideSpectralCountsMap, ignoreUnknownModifications);
        if (_log.isDebugEnabled())
        {
            int numTotalObservations = 0;
            for (AmtPeptideEntry peptideEntry : amtDatabase.getEntries())
                numTotalObservations += peptideEntry.getNumObservations();
            _log.debug("Number of total observations: " + numTotalObservations);
        }
        return amtDatabase;
    }




    /**
     * Given a directory full of pepXml files and a directory full of the
     * corresponding mzXML files, create an AMT database from all the pepXml files,
     * using the mzXML files to grab retention times.
     *
     * Highly dependent on naming conventions.  If you're starting with an all.pep.xml
     * file, use the method below instead.
     *
     * The nice thing about this is that it can check for existence of all the
     * mzXml files before starting work.
     * @param pepXmlDir
     * @param mzXmlDir
     * @param scanOrTimeMode
     * @return
     * @throws Exception
     */
    public AmtDatabase createAmtDBFromDirectories(File pepXmlDir, File ms1Dir, File mzXmlDir,
                                                         int scanOrTimeMode,
                                                         FeatureSet.FeatureSelector featureSelector,
                                                         boolean robustRegression,
                                                         double maxStudentizedResidualForRegression,
                                                         double maxStudentizedResidualForInclusion,
                                                         boolean align)
            throws Exception
    {
        boolean getTimesFromMzxml = (mzXmlDir != null);
        
        AmtDatabase resultDB = new AmtDatabase();
        List<File> ms2FeatureFiles = new ArrayList<File>();
        Map<File,File> pepXmlMzXmlFileMap = new HashMap<File,File>();

        String[] potentialFiles = pepXmlDir.list();
        Arrays.sort(potentialFiles);
        for (String pepXmlFileName : potentialFiles)
        {
            File ms2FeatureFile =
                    new File(pepXmlDir.getAbsolutePath() + File.separatorChar +
                             pepXmlFileName);
            if (ms2FeatureFile.exists() && !ms2FeatureFile.isDirectory())
                ms2FeatureFiles.add(ms2FeatureFile);
        }
        if (getTimesFromMzxml)
        {
            for (File pepXmlFile : ms2FeatureFiles)
            {
                File mzXmlFile = CommandLineModuleUtilities.findFileLikeFile(pepXmlFile, mzXmlDir,"mzXML");
                pepXmlMzXmlFileMap.put(pepXmlFile, mzXmlFile);
            }
        }

        Map<File,File> pepXmlMS1FileMap = new HashMap<File,File>();

        if (ms1Dir != null)
        {
            for (File pepXmlFile : ms2FeatureFiles)
            {
                File ms1File = CommandLineModuleUtilities.findFileLikeFile(pepXmlFile, ms1Dir, "");
                pepXmlMS1FileMap.put(pepXmlFile, ms1File);
            }
        }

        //for each pair of pepXml and mzXml files, create an amt database
        //and add its observations, runs and mods to the cumulative db
        int numFeaturesChosenForRegression = 0;
        int numFeaturesChosenForInclusion = 0;

        Pair<Integer,Integer> numFeaturesDummy = new Pair<Integer,Integer>(0,0);

        for (File ms2FeatureFile : ms2FeatureFiles)
        {
            ApplicationContext.infoMessage("Adding features from file " + ms2FeatureFile.getAbsolutePath());

            FeatureSet ms2FeatureSet = new FeatureSet(ms2FeatureFile);
            ms2FeatureSet = ms2FeatureSet.filter(featureSelector);
            if (getTimesFromMzxml)
            {
                File mzXmlFile = pepXmlMzXmlFileMap.get(ms2FeatureFile);
                _log.debug("Populating times for MS2 file " + ms2FeatureFile + " from mzXML file " + mzXmlFile);
                MSRun thisRun = MSRun.load(mzXmlFile.getAbsolutePath());
                ms2FeatureSet.populateTimesForMS2Features(thisRun);
            }

            FeatureSet ms1FeatureSet = null;
            if (pepXmlMS1FileMap.get(ms2FeatureFile) != null)
            {
                File ms1File = pepXmlMS1FileMap.get(ms2FeatureFile);
                ms1FeatureSet = new FeatureSet(ms1File);
                ApplicationContext.infoMessage("\tUsing " + ms1FeatureSet.getFeatures().length +
                        " MS1 features from file " + ms1File.getAbsolutePath());

            }


            if (align)
            {
                addRunToAmtDatabase(resultDB, ms2FeatureSet, ms1FeatureSet,
                        scanOrTimeMode, robustRegression, maxStudentizedResidualForRegression,
                        maxStudentizedResidualForInclusion, 2, .05, 10);
            }
            else
            {
                AmtDatabase thisRunDB =
                        createAmtDatabaseForRun(ms2FeatureSet, ms1FeatureSet,
                                scanOrTimeMode, robustRegression,
                                numFeaturesDummy, maxStudentizedResidualForRegression,
                                maxStudentizedResidualForInclusion);
                if (thisRunDB != null)
                {
                    numFeaturesChosenForRegression += numFeaturesDummy.first;
                    numFeaturesChosenForInclusion += numFeaturesDummy.second;

                    resultDB.addObservationsFromAnotherDatabase(thisRunDB);
                    _log.debug("Features chosen for regression: " + numFeaturesChosenForRegression);
                    ApplicationContext.infoMessage("\tPeptides from this run: " + thisRunDB.numEntries() +
                            ", running count: " + resultDB.numEntries());
                }
            }
        }

        return resultDB;
    }

    /**
     *
     * @param ms2FeatureSet
     * @param ms1FeatureSet
     */
    protected void keepOnlySinglyMatchedMS1Features(FeatureSet ms2FeatureSet, FeatureSet ms1FeatureSet)
    {
        int numTotalMS1Features=ms1FeatureSet.getFeatures().length;

        Window2DFeatureSetMatcher featureSetMatcher =
                new Window2DFeatureSetMatcher();
        featureSetMatcher.setMassDiffType(FeatureSetMatcher.DELTA_MASS_TYPE_PPM);
        featureSetMatcher.setMaxMassDiff(ms1Ms2MassTolerancePPM);
        featureSetMatcher.setMinMassDiff(-ms1Ms2MassTolerancePPM);
        featureSetMatcher.setMaxElutionDiff(ms1Ms2TimeToleranceSeconds);
        featureSetMatcher.setMinElutionDiff(-ms1Ms2TimeToleranceSeconds);
        featureSetMatcher.setElutionMode(BaseFeatureSetMatcherImpl.ELUTION_MODE_TIME);
        featureSetMatcher.setMatchWithinChargeOnly(true);

        FeatureSetMatcher.FeatureMatchingResult ms1MS2MatchingResult =
                featureSetMatcher.matchFeatures(ms1FeatureSet, ms2FeatureSet);

        if (_log.isDebugEnabled())
        {
            List<Float> deltaMasses = new ArrayList<Float>();
            List<Float> deltaTimes = new ArrayList<Float>();
            for (Feature feature : ms1MS2MatchingResult.getMasterSetFeatures())
            {
                List<Feature> ms2MatchedFeatures = ms1MS2MatchingResult.getSlaveSetFeatures(feature);
                for (Feature ms2Feature : ms2MatchedFeatures)
                {
                    deltaMasses.add((ms2Feature.getMass() - feature.getMass()) * 1000000 / ms2Feature.getMass());
                    deltaTimes.add(ms2Feature.getTime() - feature.getTime());
                }
            }
            float deltaMassStdDev = BasicStatistics.standardDeviationFloatList(deltaMasses);
            float deltaTimeStdDev = BasicStatistics.standardDeviationFloatList(deltaTimes);


//PanelWithHistogram pwh2 = new PanelWithHistogram(deltaTimes, "deltaTime");
//pwh2.displayDialog("deltaTime");
            _log.debug("Matching MS1 to MS2: 3 standard deviations: mass=" + (3 * deltaMassStdDev) +
                    ", time=" + (3*deltaTimeStdDev));
        }

        List<Feature> singlyMatchedMS1Features = new ArrayList<Feature>();
        for (Feature feature : ms1MS2MatchingResult.getMasterSetFeatures())
        {
            List<Feature> ms2MatchedFeatures = ms1MS2MatchingResult.getSlaveSetFeatures(feature);
            Set<String> peptides = new HashSet<String>();
            for (Feature ms2MatchedFeature : ms2MatchedFeatures)
                peptides.add(MS2ExtraInfoDef.getFirstPeptide(ms2MatchedFeature));
            if (peptides.size() == 1)
            {
                Feature ms2MatchedFeature = ms2MatchedFeatures.get(0);
                MS2ExtraInfoDef.setSinglePeptide(feature, peptides.iterator().next());
                MS2ExtraInfoDef.setModifiedAminoAcids(feature, MS2ExtraInfoDef.getModifiedAminoAcids(ms2MatchedFeature));
                MS2ExtraInfoDef.setPeptideProphet(feature, MS2ExtraInfoDef.getPeptideProphet(ms2MatchedFeature));
                singlyMatchedMS1Features.add(feature);
            }
        }
        ms1FeatureSet.setFeatures(singlyMatchedMS1Features.toArray(new Feature[singlyMatchedMS1Features.size()]));
        //set featureset modifications
        MS2ExtraInfoDef.setFeatureSetModifications(ms1FeatureSet,
                MS2ExtraInfoDef.getFeatureSetModifications(ms2FeatureSet));
        _log.debug("MS2 features: " + ms2FeatureSet.getFeatures().length + ", MS1 features: " +
                numTotalMS1Features + ", total matched: " + ms1MS2MatchingResult.size() +
                ", singly matched: " + singlyMatchedMS1Features.size() );

    }

    /**
     * workinprogress
     *
     * Add a run to an AMT database.
     *
     * After linear H calculation, if we have significant agreeing peptide overlap (at least
     * minMatchedPeptides that match database entries with at least minDBObservationsForAlignmentMatch
     * observations, within maxHDiffForAlignmentMatch H units) then use those agreeing
     * peptides as points for alignment, and set all the rest of the H values according
     * to that alignment. 
     *
     * @param amtDatabase
     * @param newRunFeatureSet
     * @param scanOrTimeMode
     * @param robustRegression
     * @param maxStudentizedResidualForRegression
     * @param maxStudentizedResidualForInclusion
     * @param maxHDiffForAlignmentMatch
     * @param minMatchedPeptidesForAlignment
     */
    public void addRunToAmtDatabase(AmtDatabase amtDatabase, FeatureSet newRunFeatureSet,
                                           FeatureSet ms1FeatureSet,
                                           int scanOrTimeMode,
                                           boolean robustRegression,
                                           double maxStudentizedResidualForRegression,
                                           double maxStudentizedResidualForInclusion,
                                           double minDBObservationsForAlignmentMatch,
                                           double maxHDiffForAlignmentMatch,
                                           int minMatchedPeptidesForAlignment)
    {
        Pair<Integer, Integer> outNumFeaturesChosen = new Pair<Integer, Integer>(0,0);
        AmtDatabase thisRunDB = createAmtDatabaseForRun(
                newRunFeatureSet,
                ms1FeatureSet,
                scanOrTimeMode,
                robustRegression,
                outNumFeaturesChosen,
                maxStudentizedResidualForRegression,
                maxStudentizedResidualForInclusion);
        amtDatabase.addObservationsFromAnotherDatabase(thisRunDB);
    }

    /**
     * Create an AMT database from an all.pep.xml file -- pass null
     * to the workhorse method so that it knows we need to look in the file
     * from a reference to the mzXML file
     * @param allPepXmlFile
     * @param scanOrTimeMode
     * @param featureSelector
     * @param robustRegression
     * @return
     * @throws Exception
     */
    public AmtDatabase createAmtDBFromAllPepXml(File allPepXmlFile,
                                                       File ms1Dir,
                                                       int scanOrTimeMode,
                                                       FeatureSet.FeatureSelector featureSelector,
                                                       boolean robustRegression,
                                                       double maxStudentizedResidualForRegression,
                                                       double maxStudentizedResidualForInclusion)
            throws Exception
    {
        return createAmtDBFromPepXml(allPepXmlFile, ms1Dir, null, scanOrTimeMode,
                                     featureSelector, robustRegression,
                                     maxStudentizedResidualForRegression,
                                     maxStudentizedResidualForInclusion);
    }

    /**
     * Create an AMT database from a pep.xml file.
     *
     * If no mzXml file is specified, look in the pepXml fractions themselves
     * to find mzXML files
     *
     * Warning:
     * Since PepXmlLoader runs through the pepXml file sequentially, we have to
     * load the databases one at a time... so if an mzXml file is missing
     * at the very end, we won't find it until we've done most of the work.
     *
     * TODO: currently this will fail if you're on an OS that uses a different
     * file separator character from the one the all.pep.xml file was created on,
     * because fraction.getSpectrumPath() will have the wrong characters.  Should be converted
     * @param allPepXmlFile
     * @param scanOrTimeMode
     * @return
     * @throws Exception
     */
    public AmtDatabase createAmtDBFromPepXml(File allPepXmlFile,
                                                    File ms1Dir,
                                                    MSRun run,
                                                    int scanOrTimeMode,
                                                    FeatureSet.FeatureSelector featureSelector,
                                                    boolean robustRegression,
                                                    double maxStudentizedResidualForRegression,
                                                    double maxStudentizedResidualForInclusion)
            throws Exception
    {
        AmtDatabase resultDB = new AmtDatabase();
        PepXmlLoader pepXmlLoader = new PepXmlLoader(allPepXmlFile, _log);

        PepXmlLoader.FractionIterator fractionIterator =
                pepXmlLoader.getFractionIterator();
        File pepXmlFileDir = allPepXmlFile.getParentFile();

        boolean shouldLoadRuns =
                (run == null && scanOrTimeMode == ProteomicsRegressionUtilities.REGRESSION_MODE_TIME);

        int numFeaturesChosenForRegression=0;
        int numFeaturesChosenForInclusion=0;
        Pair<Integer,Integer> numFeaturesDummy = new Pair<Integer,Integer>(0,0);
        int numFeaturesEvaluated = 0;
        //for each fraction, create a featureset and locate the mzXml file,
        //create an amt database, and add all the observations, runs and mods
        PepXMLFeatureFileHandler pepXmlFileHandler = new PepXMLFeatureFileHandler();
        while (fractionIterator.hasNext())
        {
            PepXmlLoader.PepXmlFraction fraction =
                    (PepXmlLoader.PepXmlFraction) fractionIterator.next();
            _log.debug("fraction " + fraction.getDataBasename());
            FeatureSet fractionFeatureSet =
                    pepXmlFileHandler.createFeatureSetFromPepXMLFraction(fraction, pepXmlLoader);

            fractionFeatureSet = fractionFeatureSet.filter(featureSelector);
            //remove all but the first occurrence of each peptide

            MS2ExtraInfoDef.setFeatureSetModifications(fractionFeatureSet,
                    fraction.getModifications().toArray(new MS2Modification[0]));
            if (shouldLoadRuns)
            {
                File mzXmlFile = new File(pepXmlFileDir.getAbsolutePath() +
                        File.separatorChar +
                        fraction.getSpectrumPath());
                _log.debug("mzxml file: " + mzXmlFile.getAbsolutePath());

                run = MSRun.load(mzXmlFile.getAbsolutePath());
            }

            if (scanOrTimeMode == ProteomicsRegressionUtilities.REGRESSION_MODE_TIME)
                fractionFeatureSet.populateTimesForMS2Features(run);

            numFeaturesEvaluated += fractionFeatureSet.getFeatures().length;

            FeatureSet ms1FeatureSet = null;
            if (ms1Dir != null)
            {
                File ms1File = CommandLineModuleUtilities.findFileWithPrefix(fraction.getDataBasename(), ms1Dir,"tsv");
                ms1FeatureSet = new FeatureSet(ms1File);
            }

            AmtDatabase thisRunDB =
                    createAmtDatabaseForRun(fractionFeatureSet,ms1FeatureSet,
                            scanOrTimeMode, robustRegression, numFeaturesDummy,
                            maxStudentizedResidualForRegression,
                            maxStudentizedResidualForInclusion);
            _log.debug("Created fraction database: " + thisRunDB);

            numFeaturesChosenForRegression += numFeaturesDummy.first;
            numFeaturesChosenForInclusion += numFeaturesDummy.second;

            resultDB.addObservationsFromAnotherDatabase(thisRunDB);
            thisRunDB.getRunBySequence(1).setMzXmlFilename(run.getFileName());
            thisRunDB.getRunBySequence(1).setPepXmlFilename(fraction.getDataBasename() + ".pep.xml");
        }
        _log.debug("Features chosen for final regression: " + numFeaturesChosenForRegression);
        _log.debug("Features chosen for inclusion: " + numFeaturesChosenForInclusion);
        _log.debug("Total features evaluated: " + numFeaturesEvaluated);
        _log.debug("Resulting database: " + resultDB);

        return resultDB;
    }






    /**
     * Create a feature set based on this AMT database, mapped to a particular run.
     * Make a BUNCH of assumptions.
     * @param run
     * @param regressionMS2FeatureSet
     * @return
     */
    public static FeatureSet buildFeatureSet(AmtDatabase amtDB,
                                             MSRun run, FeatureSet regressionMS2FeatureSet,
                                             int chargeForAllFeatures, double minPeptideProphet,
                                             boolean robustRegression)
    {
        ArrayList<Feature> resultFeaturesList = new ArrayList<Feature>();

        FeatureSet.FeatureSelector sel = new FeatureSet.FeatureSelector();
        sel.setMinPProphet((float)minPeptideProphet);

        regressionMS2FeatureSet = regressionMS2FeatureSet.filter(sel);

        regressionMS2FeatureSet.populateTimesForMS2Features(run);

        Map<String,Double> regressionMap =
            AmtUtilities.calculateHydrophobicityScanOrTimeRelationship(
                    regressionMS2FeatureSet.getFeatures(),
                    ProteomicsRegressionUtilities.REGRESSION_MODE_TIME,
                    robustRegression);

        double slope = regressionMap.get(RegressionUtilities.REGRESSION_SLOPE_KEY);
        double intercept = regressionMap.get(RegressionUtilities.REGRESSION_INTERCEPT_KEY);

        for (AmtPeptideEntry peptideEntry : amtDB.getEntries())
        {
            int scan = (int) AmtUtilities.predictScanOrTime(slope, intercept,
                                             peptideEntry.getMedianObservedHydrophobicity());
            Feature newFeature =
                    MS2ExtraInfoDef.createMS2Feature(scan, (float) peptideEntry.getMass(), chargeForAllFeatures,
                                                     peptideEntry.getPeptideSequence(), null, null);
            AmtExtraInfoDef.setObservedHydrophobicity(newFeature,peptideEntry.getMedianObservedHydrophobicity());

            //fake support for our fake charge
            if (chargeForAllFeatures>1)
                newFeature.setPeaks(2);

            //fake intensity
            newFeature.setIntensity(200);

            resultFeaturesList.add(newFeature);
        }

        return new FeatureSet(resultFeaturesList.toArray(new Feature[0]));
    }

    public float getMs1Ms2MassTolerancePPM()
    {
        return ms1Ms2MassTolerancePPM;
    }

    public void setMs1Ms2MassTolerancePPM(float ms1Ms2MassTolerancePPM)
    {
        this.ms1Ms2MassTolerancePPM = ms1Ms2MassTolerancePPM;
    }

    public float getMs1Ms2TimeToleranceSeconds()
    {
        return ms1Ms2TimeToleranceSeconds;
    }

    public void setMs1Ms2TimeToleranceSeconds(float ms1Ms2TimeToleranceSeconds)
    {
        this.ms1Ms2TimeToleranceSeconds = ms1Ms2TimeToleranceSeconds;
    }

    public boolean isIgnoreUnknownModifications()
    {
        return ignoreUnknownModifications;
    }

    public void setIgnoreUnknownModifications(boolean ignoreUnknownModifications)
    {
        this.ignoreUnknownModifications = ignoreUnknownModifications;
    }
}