AmtDatabaseManager.java example

Explorer
msInspect-master
/*
 * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fhcrc.cpl.viewer.amt;

import java.util.*;
import org.apache.log4j.Logger;
import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet;
import org.fhcrc.cpl.toolbox.proteomics.feature.matching.FeatureSetMatcher;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.AmtExtraInfoDef;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef;
import org.fhcrc.cpl.viewer.ms2.Fractionation2DUtilities;
import org.fhcrc.cpl.toolbox.*;
import org.fhcrc.cpl.toolbox.statistics.BasicStatistics;
import org.fhcrc.cpl.toolbox.statistics.RegressionUtilities;
import org.fhcrc.cpl.toolbox.datastructure.Pair;
import org.fhcrc.cpl.toolbox.gui.chart.*;
import org.fhcrc.cpl.toolbox.proteomics.MS2Modification;

/**
 * Manages AMT databases -- prunes outliers of various kinds
 *
 */
public class AmtDatabaseManager
{
    static Logger _log = Logger.getLogger(AmtDatabaseManager.class);

    //default parameters for median alignment
    public static final int DEFAULT_MIN_PEPTIDES_FOR_ALIGNMENT_REGRESSION=30;
    public static final int DEFAULT_MIN_OBSERVATIONS_FOR_ALIGNMENT_REGRESSION=3;

    //parameters for database self-alignment
    public static final int DEFAULT_MATCHING_DEGREE_FOR_DB_ALIGNMENT = AmtDatabaseMatcher.DEFAULT_NONLINEAR_MAPPING_DEGREE;


    /**
     * Remove entries with only one observation, where that observation differs from
     * predicted H by more than significantHydroDifference
     * @param amtDB
     * @param significantHydroDifference
     */
    public static void removePredictedHOutliers(AmtDatabase amtDB,
                                                float significantHydroDifference)
    {
        int excludedEntryCount = 0;
        for (AmtPeptideEntry peptideEntry : amtDB.getEntries())
        {
            //if there's only one observation for this peptide entry, and we're conditionally
            //excluding single-observation entries...
            if (peptideEntry.getNumObservations() < 2)
            {
                //if the single obseration is too far off from the predicted hydrophobicity...
                if (Math.abs(peptideEntry.getPredictedHydrophobicity() -
                        peptideEntry.getMedianObservedHydrophobicity()) >
                        significantHydroDifference)
                {
                    amtDB.removeEntry(peptideEntry.getPeptideSequence());
                    excludedEntryCount++;
                }
            }
        }
        ApplicationContext.infoMessage("Removed " + excludedEntryCount +
                " entries with suspicious observed H values");
    }

    /**
     * Adjust AMT database entries to account for the effect of acrylamide
     * @param amtDB Altered in place
     * @param fromAcrylamideToNot If true, adjust acrylamide-containing entries to remove acrylamide's effect.
     * If false, adjust non-acrylamide-containing entries to add the effect of acrylamide
     * @return
     */
    public static AmtDatabase adjustEntriesForAcrylamide(AmtDatabase amtDB,
                                                         boolean fromAcrylamideToNot,
                                                         boolean showCharts)
    {
        List<Double> oldHVals = new ArrayList<Double>();
        List<Double> newHVals = new ArrayList<Double>();

        for (AmtPeptideEntry peptideEntry : amtDB.getEntries())
        {
            for (AmtPeptideEntry.AmtPeptideObservation obs : peptideEntry.getObservations())
            {
                double oldHVal = obs.getObservedHydrophobicity();
                double newHydrophobicity;
                if (fromAcrylamideToNot)
                    newHydrophobicity = AcrylamideHydrophobicityAdjuster.removeAcrylamideEffect(
                            peptideEntry.getPeptideSequence(), oldHVal);
                else
                    newHydrophobicity = AcrylamideHydrophobicityAdjuster.adjustHForAcrylamide(
                            peptideEntry.getPeptideSequence(), oldHVal);
                obs.setObservedHydrophobicity(newHydrophobicity);
                oldHVals.add(oldHVal);
                newHVals.add(newHydrophobicity);
            }
            peptideEntry.recalculateStats();
        }

        if (showCharts)
        {
            ScatterPlotDialog spd = new ScatterPlotDialog(oldHVals, newHVals, "New vs. Old hydrohpobicity values");
            spd.setVisible(true);
        }                                      

        return amtDB;
    }


    protected static Set<String> intersection(Set<String> overlapSet1, Set<String> overlapSet2)
    {
        Set<String> intersSet = new HashSet<String>(overlapSet1);
        intersSet.retainAll(overlapSet2);
        return intersSet;
    }

    /**
     *
     **
     * @param amtDB
     * @param minMatchedPeptides
     */
    public static AmtDatabase alignAllRunsUsingCommonPeptides(AmtDatabase amtDB,
                                           int minMatchedPeptides, int matchingdegree,
                                           float maxLeverageNumerator, float maxStudentizedResidual, boolean showCharts)
    {

        Map<AmtRunEntry, Set<String>> peptidesInRuns =
                new HashMap<AmtRunEntry, Set<String>>(amtDB.numRuns());

        int numAllRuns = amtDB.numRuns();

        ApplicationContext.infoMessage("Choosing best runs to start with...");

        for (AmtRunEntry run : amtDB.getRuns())
        {
            Set<String> peptidesThisRun = new HashSet<String>();
            for (AmtPeptideEntry peptideEntry1 : amtDB.getPeptideEntriesForRun(run))
                peptidesThisRun.add(peptideEntry1.getPeptideSequence());
            peptidesInRuns.put(run, peptidesThisRun);
        }

        Pair<AmtRunEntry, AmtRunEntry> closestRuns = null;
        int numMatchedBetweenClosest = 0;
        Set<String> closestPeptideOverlap = null;
        for (AmtRunEntry run1 : amtDB.getRuns())
        {
            Set<String> run1Peptides = peptidesInRuns.get(run1);
            for (AmtRunEntry run2 : amtDB.getRuns())
            {
                Set<String> run2Peptides = peptidesInRuns.get(run2);

                //if it can't possibly be the best one, skip it
                if (run2Peptides.size() < numMatchedBetweenClosest)
                    continue;

                Set<String> matchedPeptides = intersection(run1Peptides, run2Peptides);
                if (matchedPeptides.size() > numMatchedBetweenClosest)
                {
                    closestPeptideOverlap = matchedPeptides;
                    numMatchedBetweenClosest = matchedPeptides.size();
                    closestRuns = new Pair<AmtRunEntry, AmtRunEntry>(run1, run2);
                }
            }
        }

        if (numMatchedBetweenClosest < minMatchedPeptides)
            throw new RuntimeException("No two runs had enough in common to align");
        ApplicationContext.infoMessage("Adding initial runs...");

        AmtDatabase result = new AmtDatabase();
        AmtRunEntry baseRun = closestRuns.first;
        addRun(result, amtDB, baseRun);


        Map<AmtRunEntry, double[]> runTHMappingCoefficientMap =
                new HashMap<AmtRunEntry, double[]>();

        runTHMappingCoefficientMap.put(closestRuns.second,
                alignRun(result, amtDB, closestRuns.second, closestPeptideOverlap, matchingdegree,
                        maxLeverageNumerator, maxStudentizedResidual, showCharts));
        addRun(result, amtDB, closestRuns.second);


        Set<String> peptidesAddedToDatabase = new HashSet<String>();
        peptidesAddedToDatabase.addAll(peptidesInRuns.get(baseRun));
        peptidesAddedToDatabase.addAll(peptidesInRuns.get(closestRuns.second));

        Collection<AmtRunEntry> runsToAlign = new ArrayList<AmtRunEntry>();
        for (AmtRunEntry run : amtDB.getRuns())
        {
            if (run != baseRun && run != closestRuns.second)
                runsToAlign.add(run);
        }

        ApplicationContext.infoMessage("Adding the rest of the runs...");

        //keep track of the peptides already in the database that also exist in all runs
        Map<AmtRunEntry, Set<String>> databasePeptidesInRuns =
                new HashMap<AmtRunEntry, Set<String>>(amtDB.numRuns());
        for (AmtRunEntry runEntry : runsToAlign)
            databasePeptidesInRuns.put(runEntry, intersection(peptidesAddedToDatabase, peptidesInRuns.get(runEntry)));

        PanelWithChart runningChartPanel = null;
        while (runsToAlign.size() > 0)
        {
            ApplicationContext.setMessage("Runs remaining: " + runsToAlign.size() +
                    " (" + Rounder.round(100.0 * (float)(numAllRuns-runsToAlign.size())/(float)numAllRuns, 2) +
                    "% complete)");

            AmtRunEntry nextRun = null;
            int maxOverlapSize = -1;
            for (AmtRunEntry possibleNextRun : runsToAlign)
            {
                Set<String> possibleNextRunOverlap = databasePeptidesInRuns.get(possibleNextRun);
                possibleNextRunOverlap.addAll(intersection(peptidesAddedToDatabase, peptidesInRuns.get(possibleNextRun)));

                if (possibleNextRunOverlap.size() > maxOverlapSize)
                {
                    maxOverlapSize = possibleNextRunOverlap.size();
                    nextRun = possibleNextRun;
                }
            }

            if (maxOverlapSize < minMatchedPeptides)
            {
                ApplicationContext.infoMessage("Unable to align remaining " + runsToAlign.size() + " runs, excluding them");
                break;
            }

            runTHMappingCoefficientMap.put(nextRun,
                    alignRun(result, amtDB, nextRun, databasePeptidesInRuns.get(nextRun), matchingdegree,
                            maxLeverageNumerator, maxStudentizedResidual, showCharts));
            addRun(result, amtDB, nextRun);
            runsToAlign.remove(nextRun);
            
            //cleanup
            databasePeptidesInRuns.remove(nextRun);


        }

        if (showCharts)
        {
            double maxTimeAnyRun = 0;
            for (AmtRunEntry run : amtDB.getRuns())
            {
                for (AmtPeptideEntry.AmtPeptideObservation obs : amtDB.getObservationsForRun(run))
                    if (obs.getTimeInRun() > maxTimeAnyRun)
                        maxTimeAnyRun = obs.getTimeInRun();
            }

            PanelWithScatterPlot pwsp = new PanelWithScatterPlot();
            pwsp.setName("Realignment of DB runs");
            for (double[] thMapCoefficients : runTHMappingCoefficientMap.values())
            {
                int numDotsOnChart = ((int) maxTimeAnyRun+1) /2;
                double[] mappingXVals = new double[numDotsOnChart];
                double[] mappingYVals = new double[numDotsOnChart];
                for (int j=0; j<numDotsOnChart; j++)
                {
                    mappingXVals[j] = 2 * j;
                    mappingYVals[j] = RegressionUtilities.mapValueUsingCoefficients(thMapCoefficients, j);
                }
                pwsp.addData(mappingXVals, mappingYVals, "");
            }

//            spd.addData(reverseRegressionLineXVals, reverseRegressionLineYVals, "reverse regression line");
//            spd.addData(regressionLineXVals, regressionLineYVals, "regression line");
//            spd.addData(lowStudResMs1Times, lowStudResMs2H, "low studentized residual");

            pwsp.setAxisLabels("MS1 time","AMT Hydrophobicity");
            pwsp.displayInTab();
        }

        return result;
    }

    protected static double[] alignRun(AmtDatabase newDatabase, AmtDatabase sourceDatabase,
                          AmtRunEntry runToAdd, Set<String> peptideOverlap, int matchingDegree,
                          float maxLeverageNumerator, float maxStudentizedResidual, boolean showCharts)
    {
        List<Pair<Feature, Feature>> peptideMatches = new ArrayList<Pair<Feature, Feature>>();

        for (String commonPeptide : peptideOverlap)
        {
            AmtPeptideEntry newDatabaseEntry = newDatabase.getEntry(commonPeptide);
            AmtPeptideEntry.AmtPeptideObservation runToAddObservation =
                    sourceDatabase.getEntry(commonPeptide).getObservationForRun(runToAdd);
            Feature newDatabaseFeature = new Feature();
            AmtExtraInfoDef.setObservedHydrophobicity(newDatabaseFeature,
                    newDatabaseEntry.getMedianObservedHydrophobicity());
            Feature runToAddFeature = new Feature();
            runToAddFeature.setTime((float) runToAddObservation.getTimeInRun());
            peptideMatches.add(new Pair<Feature,Feature>(runToAddFeature, newDatabaseFeature));
        }
        AmtDatabaseMatcher matcher = new AmtDatabaseMatcher();
        matcher.setMaxRegressionStudentizedResidual(maxStudentizedResidual);
        matcher.setMaxRegressionLeverageNumerator(maxLeverageNumerator);
        matcher.setBuildCharts(true);

        float maxTimeForMap = 0;
        for (AmtPeptideEntry.AmtPeptideObservation obs : sourceDatabase.getObservationsForRun(runToAdd))
        {
            if (obs.getTimeInRun() > maxTimeForMap)
                maxTimeForMap = (float) obs.getTimeInRun() + 1;
        }
        double[] timeHydMapCoefficients = matcher.calculateTHMapCoefficientsWithMatchedFeatures(
                peptideMatches.toArray((Pair<Feature,Feature>[]) new Pair[peptideMatches.size()]),
                matchingDegree);
        runToAdd.setTimeHydMapCoefficients(timeHydMapCoefficients);
        for (AmtPeptideEntry.AmtPeptideObservation newRunObs : sourceDatabase.getObservationsForRun(runToAdd))
        {
            double newHydrophobicity =
                    RegressionUtilities.mapValueUsingCoefficients(timeHydMapCoefficients,
                            newRunObs.getTimeInRun());
            newRunObs.setObservedHydrophobicity(newHydrophobicity);
        }

        if (showCharts)
        {
            if (MultiChartDisplayPanel.getSingletonInstance().getNumCharts() > 0)
                MultiChartDisplayPanel.getSingletonInstance().removeAllCharts();
            PanelWithChart pwc = new PanelWithChart(matcher.getTimeHydrophobicityMappingChart());
            pwc.setName("Alignment for run " + newDatabase.numRuns());
            pwc.displayInTab();
        }

        return timeHydMapCoefficients;
    }



    protected static void addRun(AmtDatabase newDatabase, AmtDatabase sourceDatabase,
                          AmtRunEntry runToAdd)
    {
            Map<MS2Modification, MS2Modification> modificationMap =
                    newDatabase.addRunEntry(runToAdd);


            for (AmtPeptideEntry peptideEntry : sourceDatabase.getPeptideEntriesForRun(runToAdd))
            {
                AmtPeptideEntry newEntry = null;
//                    newEntry.setPeptideSequence(peptideEntry.getPeptideSequence());
//                    newEntry.setPredictedHydrophobicity(AmtUtilities.calculateNormalizedHydrophobicity(peptideEntry.getPeptideSequence()));
                for (AmtPeptideEntry.AmtPeptideModificationStateEntry modState : peptideEntry.getModificationStateEntries())
                {
                    AmtPeptideEntry.AmtPeptideObservation obsForRun =
                            modState.getObservationForRun(runToAdd);
                    if (obsForRun != null)
                    {
                        if (newEntry == null)
                        {
                            newEntry = AmtPeptideEntry.createEntryFromObservation(peptideEntry.getPeptideSequence(),
                                    modState.getModifications(), obsForRun);
                        }
                        else
                            newEntry.addObservation(peptideEntry.getPeptideSequence(),
                                    modState.getModifications(), obsForRun);
                    }
                }
                if (newEntry != null)
                    newDatabase.addObservationsFromEntry(newEntry, modificationMap);
            }
            ApplicationContext.infoMessage("Adding run " + runToAdd.getPepXmlFilename());
    }

    /**
     * Pick obserations with a low leverage value.
     *
     * Since there's a linear map from time to hydrophobicity, it doesn't matter
     * whether we base the leverage on time or hydrophobicity. Using hydrophobicity
     *
     * @param observations
     * @return
     */
    protected static AmtPeptideEntry.AmtPeptideObservation[]
        pickObservationsWithLowLeverage(AmtPeptideEntry.AmtPeptideObservation[] observations)
    {
        List<AmtPeptideEntry.AmtPeptideObservation> resultList =
                new ArrayList<AmtPeptideEntry.AmtPeptideObservation>();
        int n = observations.length;
        double[] xValues = new double[n];
        for (int i = 0; i < n; i++)
            xValues[i] = observations[i].getObservedHydrophobicity();
        double[] leverages = BasicStatistics.leverages(xValues);
//todo: parameterize?
        double maxLeverage = AmtDatabaseBuilder.DEFAULT_MAX_LEVERAGE_NUMERATOR / (double) n;
        List<Double> timesForRegressionList = new ArrayList<Double>();
        for (int i = 0; i < n; i++)
        {
            if (leverages[i] < maxLeverage)
            {
                resultList.add(observations[i]);
            }
        }
        return resultList.toArray(new AmtPeptideEntry.AmtPeptideObservation[0]);
    }

    /**
     * Given a set of Strings and an array of candidate sets, find the candidate set
     * with the most overlap with the base.  Ignore any sets in setsToIgnore
     *
     * This would be trivial if HashSets actually implemented the stuff that they
     * should
     * @param baseSet
     * @param candidateSets
     * @param setsToIgnore
     * @return
     */
    protected static int findSetWithMostOverlap(Set<String> baseSet,
                                                Set<String>[] candidateSets,
                                                List<Integer> setsToIgnore)
    {
        int maxOverlap = -1;
        int result = 0;
        for (int i=0; i<candidateSets.length; i++)
        {
            if (setsToIgnore.contains(i))
                continue;
            int currentOverlap = 0;
            for (String baseString : baseSet)
                if (candidateSets[i].contains(baseString))
                    currentOverlap++;
            if (currentOverlap > maxOverlap)
            {
                maxOverlap = currentOverlap;
                result = i;
            }
        }
        return result;
    }

    /**
     * Throw out observations that are above a given multiple of the standard deviation
     * of all observations for that peptide.
     *
     * REMEMBER: for low degrees of freedom (few peptide observations), standard deviation
     * gets wonky.  You really want to compare against a T value instead.
     * But according to Yan, a cutoff of 3 standard deviations is still pretty darn
     * safe.  For 3 peptide observations, it's still equivalent to tossing out <5% of
     * observations. It gets more conservative as you go up in df.
     *
     * So, IDEALLY, the parameter would be percentToThrowAway, rather than
     * hydroStdDevMultipleCutoff, but that's complicated, and this is safe enough.
     *
     * @param amtDB
     * @param hydroStdDevMultipleCutoff
     * @return
     */
    public static AmtPeptideEntry.AmtPeptideObservation[] removeHydrophobicityOutliers(
            AmtDatabase amtDB, double hydroStdDevMultipleCutoff)
    {
        List<AmtPeptideEntry.AmtPeptideObservation> resultList =
                new ArrayList<AmtPeptideEntry.AmtPeptideObservation>();

        //for debug logging
        int[] outliersPerRun = new int[amtDB.numRuns()];

        for (AmtPeptideEntry peptideEntry : amtDB.getEntries())
        {
            double deltaHydroCutoff = hydroStdDevMultipleCutoff *
                                      peptideEntry.getHydrophobicityStandardDeviation();
            if (peptideEntry.getNumObservations() < 3)
                continue;
            for (AmtPeptideEntry.AmtPeptideObservation obs :
                    peptideEntry.getObservations())
            {
                double deltaHydro =
                        Math.abs(peptideEntry.getMedianObservedHydrophobicity() -
                                 obs.getObservedHydrophobicity());
                if (deltaHydro > deltaHydroCutoff)
                {
                    resultList.add(obs);
                    //for debug logging
                    outliersPerRun[amtDB.getSequenceForRun(obs.getRunEntry()) - 1]++;
                    peptideEntry.removeObservation(obs);
                }
            }
        }
        
        //for debug logging
        if (_log.isDebugEnabled())
        {
            int totalOutliers = 0;
            for (int i=0; i<amtDB.numRuns(); i++)
            {
                _log.debug("Run " + (i+1) + ": " + outliersPerRun[i] + " outliers");
                totalOutliers += outliersPerRun[i];
            }
            _log.debug("total outliers: " + totalOutliers);
        }

        return resultList.toArray(new AmtPeptideEntry.AmtPeptideObservation[0]);
    }

    /**
     * This is a utility method to determine minimum and maximum masses for matching individual
     * features by mass.  Returns min and then max masses, in an array.
     * @param features
     * @param massMatchDeltaMass
     * @param massMatchDeltaMassType
     * @return
     */
    public static float[][] findMinAndMaxMassesForMatch(Feature[] features,
                                                        float massMatchDeltaMass,
                                                        int massMatchDeltaMassType)
    {
        float[] minMassesForMatch = new float[features.length];
        float[] maxMassesForMatch = new float[features.length];

        for (int j=0; j<features.length; j++)
        {
            float ms1FeatureMass = features[j].getMass();
            minMassesForMatch[j] = ms1FeatureMass - massMatchDeltaMass;
            maxMassesForMatch[j] = ms1FeatureMass + massMatchDeltaMass;
            if (massMatchDeltaMassType == FeatureSetMatcher.DELTA_MASS_TYPE_PPM)
            {
                float absDeltaMass = (massMatchDeltaMass * minMassesForMatch[j] / 1000000f);
                minMassesForMatch[j] = ms1FeatureMass - absDeltaMass;
                maxMassesForMatch[j] = ms1FeatureMass + absDeltaMass;
            }
        }
        return new float[][] { minMassesForMatch,maxMassesForMatch };
    }

    /**
     * Return a database containing only entries from the subset of runs
     * from the passed-in AMT database that match, by mass only, at least
     * minMassMatchPercent of their entries to ms1Features.
     *
     * Add the best runs first, and stop if db gets too big
     * @param amtDatabase
     * @param ms1FeaturesOrig
     * @param minMassMatchPercent
     * @param massMatchDeltaMass
     * @param massMatchDeltaMassType
     * @param maxNewDBEntries
     * @param modificationsInMS1
     * @param showCharts
     * @return
     */
    public static AmtDatabase removeRunsWithoutMassMatches(AmtDatabase amtDatabase,
                                                       Feature[] ms1FeaturesOrig,
                                                       int minMassMatchPercent,
                                                       float massMatchDeltaMass,
                                                       int massMatchDeltaMassType,
                                                       int maxNewDBEntries,
                                                       int maxNewDBRuns,
                                                       MS2Modification[] modificationsInMS1,
                                                       boolean showCharts)
    {
        Feature[] ms1FeaturesCopy = new Feature[ms1FeaturesOrig.length];
        System.arraycopy(ms1FeaturesOrig, 0, ms1FeaturesCopy, 0, ms1FeaturesCopy.length);
        Arrays.sort(ms1FeaturesCopy, new Feature.MassAscComparator());

        AmtDatabaseMatcher amtDatabaseMatcher = new AmtDatabaseMatcher();

        AmtDatabase result = new AmtDatabase();
        float[] percentMatchedPerRun = new float[amtDatabase.numRuns()];
        AmtRunEntry[] runEntries = amtDatabase.getRuns();

        float[][] minAndMaxMassesForMatch = 
                findMinAndMaxMassesForMatch(ms1FeaturesCopy, massMatchDeltaMass, massMatchDeltaMassType);
        float[] minMassesForMatch = minAndMaxMassesForMatch[0];
        float[] maxMassesForMatch = minAndMaxMassesForMatch[1];

        final Map<AmtRunEntry, Float> runMassMatchPercentMap =
                new HashMap<AmtRunEntry, Float>();

        int numPassing = 0;
        for (int i=0; i<runEntries.length; i++)
        {
            if (i % (Math.max(runEntries.length/10,1)) == 0)
            {
                ApplicationContext.setMessage(Rounder.round(((double) i * 100.0 / (double) runEntries.length),0) +
                        "% done evaluating runs; passing: " + numPassing + " / " + i +
                        " (" + Rounder.round(((double) numPassing  * 100.0) / (double) i, 0) + "%)...");
            }
            AmtRunEntry runEntry = runEntries[i];
            AmtDatabaseFeatureSetGenerator featureGen =
                    new AmtDatabaseFeatureSetGenerator(amtDatabase);
            FeatureSet runFeatureSet =
                    featureGen.createFeatureSetForRun(runEntry, modificationsInMS1);
            Feature[] runFeatures = runFeatureSet.getFeatures();
            Arrays.sort(runFeatures, new Feature.MassAscComparator());

            int numMassMatchedFeatures = 0;

            int ms1FeaturesIndex = 0;
            int runFeaturesIndex = 0;
            while (runFeaturesIndex < runFeatures.length &&
                   ms1FeaturesIndex < ms1FeaturesCopy.length)
            {
                while (runFeaturesIndex < runFeatures.length &&
                       runFeatures[runFeaturesIndex].getMass() <=
                        minMassesForMatch[ms1FeaturesIndex])
                    runFeaturesIndex++;
                if (runFeaturesIndex >= runFeatures.length)
                    break;
                if (runFeatures[runFeaturesIndex].getMass() <=
                        maxMassesForMatch[ms1FeaturesIndex])
                    numMassMatchedFeatures++;
                ms1FeaturesIndex++;
            }
            float massMatchesPercent =
                    (((float) numMassMatchedFeatures) / ((float) runFeatureSet.getFeatures().length) * 100);
//            System.err.println("Matches this run: " + numMassMatchedFeatures + " (" + massMatchesPercent + "%)");
            percentMatchedPerRun[i] = massMatchesPercent;

            runMassMatchPercentMap.put(runEntry, massMatchesPercent);

            if (massMatchesPercent >= minMassMatchPercent)
                numPassing++;
        }

        if (numPassing == 0)
        {
            ApplicationContext.infoMessage("No runs matched enough MS1 features.  Quitting");
            return result;
        }

        AmtRunEntry[] runEntriesSorted = new AmtRunEntry[runEntries.length];
        for (int i=0; i<runEntries.length; i++)
            runEntriesSorted[i] = runEntries[i];

        Comparator<AmtRunEntry> runComparatorByMassMatchPercentDesc =
                new Comparator<AmtRunEntry> ()
        {
            public int compare(AmtRunEntry o1, AmtRunEntry o2)
            {
                float o1Matches = runMassMatchPercentMap.get(o1);
                float o2Matches = runMassMatchPercentMap.get(o2);

                return o1Matches < o2Matches ? 1 : o1Matches > o2Matches ? -1 : 0;
            }
        };

        Arrays.sort(runEntriesSorted, runComparatorByMassMatchPercentDesc);
        ApplicationContext.infoMessage("Runs passing: " + numPassing +
                ".  Adding runs to database...");
        for (AmtRunEntry runEntry : runEntriesSorted)
        {
            float massMatchesPercent = runMassMatchPercentMap.get(runEntry);

            //since runs are sorted, stop if this run doesn't pass
            if (massMatchesPercent < minMassMatchPercent)
                break;

            int numPeptidesIfRunAdded = result.numEntries();
            for (AmtPeptideEntry peptideEntry : amtDatabase.getPeptideEntriesForRun(runEntry))
                if (!result.contains(peptideEntry.getPeptideSequence()))
                    numPeptidesIfRunAdded++;

            if (numPeptidesIfRunAdded > maxNewDBEntries)
            {
                ApplicationContext.setMessage("Too many entries in passing runs, stopping early to keep DB small");
                ApplicationContext.setMessage("Adding " + result.numRuns() + " runs, out of " + numPassing + " mass-matched");
                break;
            }

            Map<MS2Modification, MS2Modification> modificationMap =
                    result.addRunEntry(runEntry);

            for (AmtPeptideEntry peptideEntry : amtDatabase.getPeptideEntriesForRun(runEntry))
            {
                AmtPeptideEntry newEntry = null;
//                    newEntry.setPeptideSequence(peptideEntry.getPeptideSequence());
//                    newEntry.setPredictedHydrophobicity(AmtUtilities.calculateNormalizedHydrophobicity(peptideEntry.getPeptideSequence()));
                for (AmtPeptideEntry.AmtPeptideModificationStateEntry modState : peptideEntry.getModificationStateEntries())
                {
                    AmtPeptideEntry.AmtPeptideObservation obsForRun =
                            modState.getObservationForRun(runEntry);
                    if (obsForRun != null)
                    {
                        if (newEntry == null)
                        {
                            newEntry = AmtPeptideEntry.createEntryFromObservation(peptideEntry.getPeptideSequence(),
                                    modState.getModifications(), obsForRun);
                        }
                        else
                            newEntry.addObservation(peptideEntry.getPeptideSequence(),
                                    modState.getModifications(), obsForRun);
                    }
                }
                if (newEntry != null)
                    result.addObservationsFromEntry(newEntry, modificationMap);


            }
            _log.debug("Adding run " + runEntry.getPepXmlFilename() +
                    ", mass matches: " + massMatchesPercent + "%");

            if (result.numRuns() >= maxNewDBRuns)
            {
                ApplicationContext.setMessage("Too many passing runs, stopping early to keep DB small");
                ApplicationContext.setMessage("Adding " + result.numRuns() + " runs, out of " + numPassing + " mass-matched");
                break;
            }
        }

        if (result.numRuns() == 0)
        {
            ApplicationContext.infoMessage("Too many entries in a single run!  Quitting");
            return result;
        }

        ApplicationContext.infoMessage(result.numRuns() + " out of " + amtDatabase.numRuns() + " runs kept");
        ApplicationContext.infoMessage("Database summary:  " + result);
        return result;
    }


    public static AmtDatabase removeRunsByStructure(AmtDatabase amtDatabase,
                                                       int ms1RunCol, int ms1RunRow, int ms1RunExp,
                                                       int maxNewDBEntries,
                                                       int maxNewDBRuns,
                                                       Fractionation2DUtilities.FractionatedAMTDatabaseStructure amtDatabaseStructure,
                                                       boolean showCharts)
    {
        final Map<AmtRunEntry, Float> runDistanceMap =
                new HashMap<AmtRunEntry, Float>();
        AmtRunEntry[] runEntries = amtDatabase.getRuns();

        for (int i=0; i<runEntries.length; i++)
        {
            Pair<Integer, int[]> expAndPos = amtDatabaseStructure.calculateExperimentAndPosition(i);
            int runEntryCol = expAndPos.second[0];
            int runEntryRow = expAndPos.second[1];

            //manhattan distance
            float distance = Math.abs(runEntryCol - ms1RunCol) + Math.abs(runEntryRow - ms1RunRow);
            runDistanceMap.put(runEntries[i], distance);
        }

        AmtRunEntry[] runEntriesSorted = new AmtRunEntry[runEntries.length];
        for (int i=0; i<runEntriesSorted.length; i++)
            runEntriesSorted[i] = runEntries[i];

        Comparator<AmtRunEntry> runComparatorByDistanceAsc =
                new Comparator<AmtRunEntry> ()
        {
            public int compare(AmtRunEntry o1, AmtRunEntry o2)
            {
                float o1Matches = runDistanceMap.get(o1);
                float o2Matches = runDistanceMap.get(o2);

                return o1Matches < o2Matches ? -1 : o1Matches > o2Matches ? 1 : 0;
            }
        };

        Arrays.sort(runEntriesSorted, runComparatorByDistanceAsc);


        return removeRunsInOrder(amtDatabase, maxNewDBEntries, maxNewDBRuns, amtDatabaseStructure,
                                 runEntriesSorted, showCharts, null, null, null, null);
    }

    protected static Set<String> createPeptideSetFromFeatures(Feature[] features)
    {
        Set<String> peptides = new HashSet<String>();
        for (Feature feature : features)
        {
            String peptide = MS2ExtraInfoDef.getFirstPeptide(feature);
            if (peptide != null)
                peptides.add(peptide);
        }
        return peptides;
    }

    /**
     * Remove runs with insufficient peptide matches to the MS/MS features passed in
     * @param amtDatabase
     * @param ms2Features
     * @param minPeptideMatchPercent
     * @param maxNewDBEntries
     * @param maxNewDBRuns
     * @param showCharts
     * @return
     */
    public static AmtDatabase removeRunsWithoutPeptideMatches(AmtDatabase amtDatabase,
                                                       Feature[] ms2Features,
                                                       int minPeptideMatchPercent,
                                                       int maxNewDBEntries,
                                                       int maxNewDBRuns,
                                                       Fractionation2DUtilities.FractionatedAMTDatabaseStructure amtDatabaseStructure,
                                                       boolean showCharts,
                                                       Feature[] ms1Features,
                                                       MS2Modification[] ms2ModificationsForMatching,
                                                       AmtDatabaseMatcher dbMatcher)
    {
        float[] percentMatchedPerRun = new float[amtDatabase.numRuns()];
        AmtRunEntry[] runEntries = amtDatabase.getRuns();

        final Map<AmtRunEntry, Integer> runPeptideMatchNumberMap =
                new HashMap<AmtRunEntry, Integer>();
        final Map<AmtRunEntry, Float> runPeptideMatchPercentMap =
                new HashMap<AmtRunEntry, Float>();

        Set<String> ms2Peptides = createPeptideSetFromFeatures(ms2Features);

        int numPassing = 0;
        for (int i=0; i<runEntries.length; i++)
        {
            if (i % (Math.max(runEntries.length/10,1)) == 0)
            {
                ApplicationContext.setMessage(Rounder.round(((double) i * 100.0 / (double) runEntries.length),0) +
                        "% done evaluating runs; passing: " + numPassing + " / " + i +
                        " (" + Rounder.round(((double) numPassing  * 100.0) / (double) i, 0) + "%)...");
            }
            AmtRunEntry runEntry = runEntries[i];

            int numPeptidesThisRun = 0;
            int numPeptidesInCommon = 0;
            for (AmtPeptideEntry peptideEntry : amtDatabase.getEntries())
            {
                String peptideSequence = peptideEntry.getPeptideSequence();
                AmtPeptideEntry.AmtPeptideObservation obs =
                        peptideEntry.getObservationForRun(runEntry);
                if (obs != null)
                {
                    numPeptidesThisRun++;
                    if (ms2Peptides.contains(peptideSequence))
                        numPeptidesInCommon++;
                }
            }

            float peptideMatchesPercent =
                    (((float) numPeptidesInCommon) /
                            ((float) numPeptidesThisRun) * 100);
//            System.err.println("Matches this run: " + numMassMatchedFeatures + " (" + massMatchesPercent + "%)");
            percentMatchedPerRun[i] = peptideMatchesPercent;

            runPeptideMatchPercentMap.put(runEntry, peptideMatchesPercent);
            runPeptideMatchNumberMap.put(runEntry, numPeptidesInCommon);

            if (peptideMatchesPercent >= minPeptideMatchPercent)
                numPassing++;
        }

        if (numPassing == 0)
        {
            ApplicationContext.infoMessage("No runs matched enough MS1 features.  Quitting");
            return new AmtDatabase();
        }

        if (showCharts && amtDatabaseStructure != null)
        {
            double[] peptideMatchesPerRun = new double[amtDatabase.numRuns()];
            for (int i=0; i<peptideMatchesPerRun.length; i++)
                peptideMatchesPerRun[i] = runPeptideMatchNumberMap.get(amtDatabase.getRunBySequence(i+1));

            Fractionation2DUtilities.showHeatMapChart(
                    amtDatabaseStructure,
                    peptideMatchesPerRun, "AMT Database Run Peptides In Common with This Run",
                    true);

            double[] peptidePercentMatchesPerRun = new double[amtDatabase.numRuns()];
            for (int i=0; i<peptidePercentMatchesPerRun.length; i++)
                peptidePercentMatchesPerRun[i] = runPeptideMatchPercentMap.get(amtDatabase.getRunBySequence(i+1));

            Fractionation2DUtilities.showHeatMapChart(
                    amtDatabaseStructure,
                    peptidePercentMatchesPerRun, "AMT Database Run Peptide % In Common with This Run",
                    true);
        }

        List<AmtRunEntry> passingRunEntriesList = new ArrayList<AmtRunEntry>(runEntries.length);
        for (AmtRunEntry runEntry : runEntries)
            if (runPeptideMatchPercentMap.get(runEntry) >= minPeptideMatchPercent)
                passingRunEntriesList.add(runEntry);

        AmtRunEntry[] runEntriesSorted = new AmtRunEntry[passingRunEntriesList.size()];
        for (int i=0; i<runEntriesSorted.length; i++)
            runEntriesSorted[i] = passingRunEntriesList.get(i);

        Comparator<AmtRunEntry> runComparatorByPeptideMatchPercentDesc =
                new Comparator<AmtRunEntry> ()
        {
            public int compare(AmtRunEntry o1, AmtRunEntry o2)
            {
                float o1Matches = runPeptideMatchPercentMap.get(o1);
                float o2Matches = runPeptideMatchPercentMap.get(o2);

                return o1Matches < o2Matches ? 1 : o1Matches > o2Matches ? -1 : 0;
            }
        };

        Comparator<AmtRunEntry> runComparatorByPeptideMatchNumberDesc =
                new Comparator<AmtRunEntry> ()
        {
            public int compare(AmtRunEntry o1, AmtRunEntry o2)
            {
                float o1Matches = runPeptideMatchNumberMap.get(o1);
                float o2Matches = runPeptideMatchNumberMap.get(o2);

                return o1Matches < o2Matches ? 1 : o1Matches > o2Matches ? -1 : 0;
            }
        };

        Arrays.sort(runEntriesSorted, runComparatorByPeptideMatchPercentDesc);

        ApplicationContext.infoMessage("Runs passing: " + numPassing +
                ".  Adding runs to database...");
        return removeRunsInOrder(amtDatabase, maxNewDBEntries, maxNewDBRuns, amtDatabaseStructure,
                                 runEntriesSorted, showCharts, ms1Features, ms2ModificationsForMatching,
                                 ms2Features, dbMatcher);

    }

    /**
     * Trims down an AMT database by adding runs one by one from runEntriesSorted, stopping when
     * adding another run would violate maxNewDBEntries or maxNewDBRuns. 
     * @param amtDatabase
     * @param maxNewDBEntries
     * @param maxNewDBRuns
     * @param amtDatabaseStructure
     * @param runEntriesSorted  sorted in _descending_ order of goodness
     * @param showCharts
     * @return
     */
    public static AmtDatabase removeRunsInOrder(AmtDatabase amtDatabase,
                                                int maxNewDBEntries,
                                                int maxNewDBRuns,
                                                Fractionation2DUtilities.FractionatedAMTDatabaseStructure amtDatabaseStructure,
                                                AmtRunEntry[] runEntriesSorted,
                                                boolean showCharts,
                                                Feature[] ms1Features,
                                                MS2Modification[] ms2ModificationsForMatching,
                                                Feature[] ms2Features,
                                                AmtDatabaseMatcher databaseMatcher)
    {
        //if true, we will engage in an extremely costly plot of FAR vs. number of runs kept in the database.
        //This was done in order to produce that plot for the HUPO conference.  Not normally useful, as
        //it's very hand-wavy
        boolean plotFarVsNumRuns = false;

        AmtDatabase result = new AmtDatabase();

        int numSortedRuns = runEntriesSorted.length;

        //variables for db density vs. FAR chart
        List<Double> farsWithNumbersOfRuns = new ArrayList<Double>();
        List<Double> numEntriesWithNumbersOfRuns = new ArrayList<Double>();
        

        List<AmtRunEntry> runsAdded = new ArrayList<AmtRunEntry>();
        for (AmtRunEntry runEntry : runEntriesSorted)
        {
            int numPeptidesIfRunAdded = result.numEntries();
            for (AmtPeptideEntry peptideEntry : amtDatabase.getPeptideEntriesForRun(runEntry))
                if (!result.contains(peptideEntry.getPeptideSequence()))
                    numPeptidesIfRunAdded++;

            if (numPeptidesIfRunAdded > maxNewDBEntries)
            {
                ApplicationContext.setMessage("Too many entries in passing runs, stopping early to keep DB small");
                ApplicationContext.setMessage("Adding " + result.numRuns() + " runs, out of " + numSortedRuns);
                break;
            }
            if (result.numRuns() + 1 >= maxNewDBRuns)
            {
                ApplicationContext.setMessage("Too many passing runs, stopping early to keep DB small");
                ApplicationContext.setMessage("Adding " + result.numRuns() + " runs, out of " + numSortedRuns);
                break;
            }

            Map<MS2Modification, MS2Modification> modificationMap =
                    result.addRunEntry(runEntry);

            for (AmtPeptideEntry peptideEntry : amtDatabase.getPeptideEntriesForRun(runEntry))
            {
                AmtPeptideEntry newEntry = null;
                for (AmtPeptideEntry.AmtPeptideModificationStateEntry modState :
                        peptideEntry.getModificationStateEntries())
                {
                    AmtPeptideEntry.AmtPeptideObservation obsForRun =
                            modState.getObservationForRun(runEntry);
                    if (obsForRun != null)
                    {
                        if (newEntry == null)
                        {
                            newEntry = AmtPeptideEntry.createEntryFromObservation(peptideEntry.getPeptideSequence(),
                                    modState.getModifications(), obsForRun);
                        }
                        else
                            newEntry.addObservation(peptideEntry.getPeptideSequence(),
                                    modState.getModifications(), obsForRun);
                    }
                }
                if (newEntry != null)
                    result.addObservationsFromEntry(newEntry, modificationMap);


            }
            runsAdded.add(runEntry);
            _log.debug("Adding run " + runEntry.getPepXmlFilename() + " (#" + result.numRuns() + ")");
        }

        if (result.numRuns() == 0)
        {
            ApplicationContext.infoMessage("Too many entries in a single run!  Quitting");
            return result;
        }

        if (showCharts && amtDatabaseStructure != null)
        {
            double[] matchedRuns = new double[amtDatabase.numRuns()];
            for (AmtRunEntry runEntry : amtDatabase.getRuns())
            {
                if (runsAdded.contains(runEntry))
                {
                    matchedRuns[amtDatabase.getSequenceForRun(runEntry)-1]++;
                }
            }

            Fractionation2DUtilities.showHeatMapChart(
                    amtDatabaseStructure,
                    matchedRuns, "AMT Runs Kept For Matching",
                    false);
        }

        //Only for creating an extremely costly plot        
        if (showCharts && ms1Features != null && ms2ModificationsForMatching != null && plotFarVsNumRuns)
        {
            List<Double> numbersOfRuns = new ArrayList<Double>();
            for (int i=0; i<farsWithNumbersOfRuns.size(); i++)
                numbersOfRuns.add((double)(i+1));
            System.err.println("Line chart.  Size: " + farsWithNumbersOfRuns);
            PanelWithLineChart pwlc = new PanelWithLineChart(numbersOfRuns, farsWithNumbersOfRuns,
                    "FAR vs. Number of Fractions");
            ChartDialog cd = new ChartDialog(pwlc);
            cd.setTitle("Line chart: FAR vs. number of fractions");
            cd.setVisible(true);

            PanelWithLineChart pwlc2 = new PanelWithLineChart(numEntriesWithNumbersOfRuns, farsWithNumbersOfRuns,
                    "FAR vs. Number of Entries");
            ChartDialog cd2 = new ChartDialog(pwlc2);
            cd2.setTitle("Line chart: FAR vs. number of entries");
            cd2.setVisible(true);
        }

        ApplicationContext.infoMessage(result.numRuns() + " out of " + amtDatabase.numRuns() + " runs kept");
        ApplicationContext.infoMessage("Database summary:  " + result);
        return result;
    }



}