/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.viewer.amt; import java.util.*; import org.apache.log4j.Logger; import org.fhcrc.cpl.toolbox.proteomics.feature.Feature; import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet; import org.fhcrc.cpl.toolbox.proteomics.feature.matching.FeatureSetMatcher; import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.AmtExtraInfoDef; import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef; import org.fhcrc.cpl.viewer.ms2.Fractionation2DUtilities; import org.fhcrc.cpl.toolbox.*; import org.fhcrc.cpl.toolbox.statistics.BasicStatistics; import org.fhcrc.cpl.toolbox.statistics.RegressionUtilities; import org.fhcrc.cpl.toolbox.datastructure.Pair; import org.fhcrc.cpl.toolbox.gui.chart.*; import org.fhcrc.cpl.toolbox.proteomics.MS2Modification; /** * Manages AMT databases -- prunes outliers of various kinds * */ public class AmtDatabaseManager { static Logger _log = Logger.getLogger(AmtDatabaseManager.class); //default parameters for median alignment public static final int DEFAULT_MIN_PEPTIDES_FOR_ALIGNMENT_REGRESSION=30; public static final int DEFAULT_MIN_OBSERVATIONS_FOR_ALIGNMENT_REGRESSION=3; //parameters for database self-alignment public static final int DEFAULT_MATCHING_DEGREE_FOR_DB_ALIGNMENT = AmtDatabaseMatcher.DEFAULT_NONLINEAR_MAPPING_DEGREE; /** * Remove entries with only one observation, where that observation differs from * predicted H by more than significantHydroDifference * @param amtDB * @param significantHydroDifference */ public static void removePredictedHOutliers(AmtDatabase amtDB, float significantHydroDifference) { int excludedEntryCount = 0; for (AmtPeptideEntry peptideEntry : amtDB.getEntries()) { //if there's only one observation for this peptide entry, and we're conditionally //excluding single-observation entries... if (peptideEntry.getNumObservations() < 2) { //if the single obseration is too far off from the predicted hydrophobicity... if (Math.abs(peptideEntry.getPredictedHydrophobicity() - peptideEntry.getMedianObservedHydrophobicity()) > significantHydroDifference) { amtDB.removeEntry(peptideEntry.getPeptideSequence()); excludedEntryCount++; } } } ApplicationContext.infoMessage("Removed " + excludedEntryCount + " entries with suspicious observed H values"); } /** * Adjust AMT database entries to account for the effect of acrylamide * @param amtDB Altered in place * @param fromAcrylamideToNot If true, adjust acrylamide-containing entries to remove acrylamide's effect. * If false, adjust non-acrylamide-containing entries to add the effect of acrylamide * @return */ public static AmtDatabase adjustEntriesForAcrylamide(AmtDatabase amtDB, boolean fromAcrylamideToNot, boolean showCharts) { List<Double> oldHVals = new ArrayList<Double>(); List<Double> newHVals = new ArrayList<Double>(); for (AmtPeptideEntry peptideEntry : amtDB.getEntries()) { for (AmtPeptideEntry.AmtPeptideObservation obs : peptideEntry.getObservations()) { double oldHVal = obs.getObservedHydrophobicity(); double newHydrophobicity; if (fromAcrylamideToNot) newHydrophobicity = AcrylamideHydrophobicityAdjuster.removeAcrylamideEffect( peptideEntry.getPeptideSequence(), oldHVal); else newHydrophobicity = AcrylamideHydrophobicityAdjuster.adjustHForAcrylamide( peptideEntry.getPeptideSequence(), oldHVal); obs.setObservedHydrophobicity(newHydrophobicity); oldHVals.add(oldHVal); newHVals.add(newHydrophobicity); } peptideEntry.recalculateStats(); } if (showCharts) { ScatterPlotDialog spd = new ScatterPlotDialog(oldHVals, newHVals, "New vs. Old hydrohpobicity values"); spd.setVisible(true); } return amtDB; } protected static Set<String> intersection(Set<String> overlapSet1, Set<String> overlapSet2) { Set<String> intersSet = new HashSet<String>(overlapSet1); intersSet.retainAll(overlapSet2); return intersSet; } /** * ** * @param amtDB * @param minMatchedPeptides */ public static AmtDatabase alignAllRunsUsingCommonPeptides(AmtDatabase amtDB, int minMatchedPeptides, int matchingdegree, float maxLeverageNumerator, float maxStudentizedResidual, boolean showCharts) { Map<AmtRunEntry, Set<String>> peptidesInRuns = new HashMap<AmtRunEntry, Set<String>>(amtDB.numRuns()); int numAllRuns = amtDB.numRuns(); ApplicationContext.infoMessage("Choosing best runs to start with..."); for (AmtRunEntry run : amtDB.getRuns()) { Set<String> peptidesThisRun = new HashSet<String>(); for (AmtPeptideEntry peptideEntry1 : amtDB.getPeptideEntriesForRun(run)) peptidesThisRun.add(peptideEntry1.getPeptideSequence()); peptidesInRuns.put(run, peptidesThisRun); } Pair<AmtRunEntry, AmtRunEntry> closestRuns = null; int numMatchedBetweenClosest = 0; Set<String> closestPeptideOverlap = null; for (AmtRunEntry run1 : amtDB.getRuns()) { Set<String> run1Peptides = peptidesInRuns.get(run1); for (AmtRunEntry run2 : amtDB.getRuns()) { Set<String> run2Peptides = peptidesInRuns.get(run2); //if it can't possibly be the best one, skip it if (run2Peptides.size() < numMatchedBetweenClosest) continue; Set<String> matchedPeptides = intersection(run1Peptides, run2Peptides); if (matchedPeptides.size() > numMatchedBetweenClosest) { closestPeptideOverlap = matchedPeptides; numMatchedBetweenClosest = matchedPeptides.size(); closestRuns = new Pair<AmtRunEntry, AmtRunEntry>(run1, run2); } } } if (numMatchedBetweenClosest < minMatchedPeptides) throw new RuntimeException("No two runs had enough in common to align"); ApplicationContext.infoMessage("Adding initial runs..."); AmtDatabase result = new AmtDatabase(); AmtRunEntry baseRun = closestRuns.first; addRun(result, amtDB, baseRun); Map<AmtRunEntry, double[]> runTHMappingCoefficientMap = new HashMap<AmtRunEntry, double[]>(); runTHMappingCoefficientMap.put(closestRuns.second, alignRun(result, amtDB, closestRuns.second, closestPeptideOverlap, matchingdegree, maxLeverageNumerator, maxStudentizedResidual, showCharts)); addRun(result, amtDB, closestRuns.second); Set<String> peptidesAddedToDatabase = new HashSet<String>(); peptidesAddedToDatabase.addAll(peptidesInRuns.get(baseRun)); peptidesAddedToDatabase.addAll(peptidesInRuns.get(closestRuns.second)); Collection<AmtRunEntry> runsToAlign = new ArrayList<AmtRunEntry>(); for (AmtRunEntry run : amtDB.getRuns()) { if (run != baseRun && run != closestRuns.second) runsToAlign.add(run); } ApplicationContext.infoMessage("Adding the rest of the runs..."); //keep track of the peptides already in the database that also exist in all runs Map<AmtRunEntry, Set<String>> databasePeptidesInRuns = new HashMap<AmtRunEntry, Set<String>>(amtDB.numRuns()); for (AmtRunEntry runEntry : runsToAlign) databasePeptidesInRuns.put(runEntry, intersection(peptidesAddedToDatabase, peptidesInRuns.get(runEntry))); PanelWithChart runningChartPanel = null; while (runsToAlign.size() > 0) { ApplicationContext.setMessage("Runs remaining: " + runsToAlign.size() + " (" + Rounder.round(100.0 * (float)(numAllRuns-runsToAlign.size())/(float)numAllRuns, 2) + "% complete)"); AmtRunEntry nextRun = null; int maxOverlapSize = -1; for (AmtRunEntry possibleNextRun : runsToAlign) { Set<String> possibleNextRunOverlap = databasePeptidesInRuns.get(possibleNextRun); possibleNextRunOverlap.addAll(intersection(peptidesAddedToDatabase, peptidesInRuns.get(possibleNextRun))); if (possibleNextRunOverlap.size() > maxOverlapSize) { maxOverlapSize = possibleNextRunOverlap.size(); nextRun = possibleNextRun; } } if (maxOverlapSize < minMatchedPeptides) { ApplicationContext.infoMessage("Unable to align remaining " + runsToAlign.size() + " runs, excluding them"); break; } runTHMappingCoefficientMap.put(nextRun, alignRun(result, amtDB, nextRun, databasePeptidesInRuns.get(nextRun), matchingdegree, maxLeverageNumerator, maxStudentizedResidual, showCharts)); addRun(result, amtDB, nextRun); runsToAlign.remove(nextRun); //cleanup databasePeptidesInRuns.remove(nextRun); } if (showCharts) { double maxTimeAnyRun = 0; for (AmtRunEntry run : amtDB.getRuns()) { for (AmtPeptideEntry.AmtPeptideObservation obs : amtDB.getObservationsForRun(run)) if (obs.getTimeInRun() > maxTimeAnyRun) maxTimeAnyRun = obs.getTimeInRun(); } PanelWithScatterPlot pwsp = new PanelWithScatterPlot(); pwsp.setName("Realignment of DB runs"); for (double[] thMapCoefficients : runTHMappingCoefficientMap.values()) { int numDotsOnChart = ((int) maxTimeAnyRun+1) /2; double[] mappingXVals = new double[numDotsOnChart]; double[] mappingYVals = new double[numDotsOnChart]; for (int j=0; j<numDotsOnChart; j++) { mappingXVals[j] = 2 * j; mappingYVals[j] = RegressionUtilities.mapValueUsingCoefficients(thMapCoefficients, j); } pwsp.addData(mappingXVals, mappingYVals, ""); } // spd.addData(reverseRegressionLineXVals, reverseRegressionLineYVals, "reverse regression line"); // spd.addData(regressionLineXVals, regressionLineYVals, "regression line"); // spd.addData(lowStudResMs1Times, lowStudResMs2H, "low studentized residual"); pwsp.setAxisLabels("MS1 time","AMT Hydrophobicity"); pwsp.displayInTab(); } return result; } protected static double[] alignRun(AmtDatabase newDatabase, AmtDatabase sourceDatabase, AmtRunEntry runToAdd, Set<String> peptideOverlap, int matchingDegree, float maxLeverageNumerator, float maxStudentizedResidual, boolean showCharts) { List<Pair<Feature, Feature>> peptideMatches = new ArrayList<Pair<Feature, Feature>>(); for (String commonPeptide : peptideOverlap) { AmtPeptideEntry newDatabaseEntry = newDatabase.getEntry(commonPeptide); AmtPeptideEntry.AmtPeptideObservation runToAddObservation = sourceDatabase.getEntry(commonPeptide).getObservationForRun(runToAdd); Feature newDatabaseFeature = new Feature(); AmtExtraInfoDef.setObservedHydrophobicity(newDatabaseFeature, newDatabaseEntry.getMedianObservedHydrophobicity()); Feature runToAddFeature = new Feature(); runToAddFeature.setTime((float) runToAddObservation.getTimeInRun()); peptideMatches.add(new Pair<Feature,Feature>(runToAddFeature, newDatabaseFeature)); } AmtDatabaseMatcher matcher = new AmtDatabaseMatcher(); matcher.setMaxRegressionStudentizedResidual(maxStudentizedResidual); matcher.setMaxRegressionLeverageNumerator(maxLeverageNumerator); matcher.setBuildCharts(true); float maxTimeForMap = 0; for (AmtPeptideEntry.AmtPeptideObservation obs : sourceDatabase.getObservationsForRun(runToAdd)) { if (obs.getTimeInRun() > maxTimeForMap) maxTimeForMap = (float) obs.getTimeInRun() + 1; } double[] timeHydMapCoefficients = matcher.calculateTHMapCoefficientsWithMatchedFeatures( peptideMatches.toArray((Pair<Feature,Feature>[]) new Pair[peptideMatches.size()]), matchingDegree); runToAdd.setTimeHydMapCoefficients(timeHydMapCoefficients); for (AmtPeptideEntry.AmtPeptideObservation newRunObs : sourceDatabase.getObservationsForRun(runToAdd)) { double newHydrophobicity = RegressionUtilities.mapValueUsingCoefficients(timeHydMapCoefficients, newRunObs.getTimeInRun()); newRunObs.setObservedHydrophobicity(newHydrophobicity); } if (showCharts) { if (MultiChartDisplayPanel.getSingletonInstance().getNumCharts() > 0) MultiChartDisplayPanel.getSingletonInstance().removeAllCharts(); PanelWithChart pwc = new PanelWithChart(matcher.getTimeHydrophobicityMappingChart()); pwc.setName("Alignment for run " + newDatabase.numRuns()); pwc.displayInTab(); } return timeHydMapCoefficients; } protected static void addRun(AmtDatabase newDatabase, AmtDatabase sourceDatabase, AmtRunEntry runToAdd) { Map<MS2Modification, MS2Modification> modificationMap = newDatabase.addRunEntry(runToAdd); for (AmtPeptideEntry peptideEntry : sourceDatabase.getPeptideEntriesForRun(runToAdd)) { AmtPeptideEntry newEntry = null; // newEntry.setPeptideSequence(peptideEntry.getPeptideSequence()); // newEntry.setPredictedHydrophobicity(AmtUtilities.calculateNormalizedHydrophobicity(peptideEntry.getPeptideSequence())); for (AmtPeptideEntry.AmtPeptideModificationStateEntry modState : peptideEntry.getModificationStateEntries()) { AmtPeptideEntry.AmtPeptideObservation obsForRun = modState.getObservationForRun(runToAdd); if (obsForRun != null) { if (newEntry == null) { newEntry = AmtPeptideEntry.createEntryFromObservation(peptideEntry.getPeptideSequence(), modState.getModifications(), obsForRun); } else newEntry.addObservation(peptideEntry.getPeptideSequence(), modState.getModifications(), obsForRun); } } if (newEntry != null) newDatabase.addObservationsFromEntry(newEntry, modificationMap); } ApplicationContext.infoMessage("Adding run " + runToAdd.getPepXmlFilename()); } /** * Pick obserations with a low leverage value. * * Since there's a linear map from time to hydrophobicity, it doesn't matter * whether we base the leverage on time or hydrophobicity. Using hydrophobicity * * @param observations * @return */ protected static AmtPeptideEntry.AmtPeptideObservation[] pickObservationsWithLowLeverage(AmtPeptideEntry.AmtPeptideObservation[] observations) { List<AmtPeptideEntry.AmtPeptideObservation> resultList = new ArrayList<AmtPeptideEntry.AmtPeptideObservation>(); int n = observations.length; double[] xValues = new double[n]; for (int i = 0; i < n; i++) xValues[i] = observations[i].getObservedHydrophobicity(); double[] leverages = BasicStatistics.leverages(xValues); //todo: parameterize? double maxLeverage = AmtDatabaseBuilder.DEFAULT_MAX_LEVERAGE_NUMERATOR / (double) n; List<Double> timesForRegressionList = new ArrayList<Double>(); for (int i = 0; i < n; i++) { if (leverages[i] < maxLeverage) { resultList.add(observations[i]); } } return resultList.toArray(new AmtPeptideEntry.AmtPeptideObservation[0]); } /** * Given a set of Strings and an array of candidate sets, find the candidate set * with the most overlap with the base. Ignore any sets in setsToIgnore * * This would be trivial if HashSets actually implemented the stuff that they * should * @param baseSet * @param candidateSets * @param setsToIgnore * @return */ protected static int findSetWithMostOverlap(Set<String> baseSet, Set<String>[] candidateSets, List<Integer> setsToIgnore) { int maxOverlap = -1; int result = 0; for (int i=0; i<candidateSets.length; i++) { if (setsToIgnore.contains(i)) continue; int currentOverlap = 0; for (String baseString : baseSet) if (candidateSets[i].contains(baseString)) currentOverlap++; if (currentOverlap > maxOverlap) { maxOverlap = currentOverlap; result = i; } } return result; } /** * Throw out observations that are above a given multiple of the standard deviation * of all observations for that peptide. * * REMEMBER: for low degrees of freedom (few peptide observations), standard deviation * gets wonky. You really want to compare against a T value instead. * But according to Yan, a cutoff of 3 standard deviations is still pretty darn * safe. For 3 peptide observations, it's still equivalent to tossing out <5% of * observations. It gets more conservative as you go up in df. * * So, IDEALLY, the parameter would be percentToThrowAway, rather than * hydroStdDevMultipleCutoff, but that's complicated, and this is safe enough. * * @param amtDB * @param hydroStdDevMultipleCutoff * @return */ public static AmtPeptideEntry.AmtPeptideObservation[] removeHydrophobicityOutliers( AmtDatabase amtDB, double hydroStdDevMultipleCutoff) { List<AmtPeptideEntry.AmtPeptideObservation> resultList = new ArrayList<AmtPeptideEntry.AmtPeptideObservation>(); //for debug logging int[] outliersPerRun = new int[amtDB.numRuns()]; for (AmtPeptideEntry peptideEntry : amtDB.getEntries()) { double deltaHydroCutoff = hydroStdDevMultipleCutoff * peptideEntry.getHydrophobicityStandardDeviation(); if (peptideEntry.getNumObservations() < 3) continue; for (AmtPeptideEntry.AmtPeptideObservation obs : peptideEntry.getObservations()) { double deltaHydro = Math.abs(peptideEntry.getMedianObservedHydrophobicity() - obs.getObservedHydrophobicity()); if (deltaHydro > deltaHydroCutoff) { resultList.add(obs); //for debug logging outliersPerRun[amtDB.getSequenceForRun(obs.getRunEntry()) - 1]++; peptideEntry.removeObservation(obs); } } } //for debug logging if (_log.isDebugEnabled()) { int totalOutliers = 0; for (int i=0; i<amtDB.numRuns(); i++) { _log.debug("Run " + (i+1) + ": " + outliersPerRun[i] + " outliers"); totalOutliers += outliersPerRun[i]; } _log.debug("total outliers: " + totalOutliers); } return resultList.toArray(new AmtPeptideEntry.AmtPeptideObservation[0]); } /** * This is a utility method to determine minimum and maximum masses for matching individual * features by mass. Returns min and then max masses, in an array. * @param features * @param massMatchDeltaMass * @param massMatchDeltaMassType * @return */ public static float[][] findMinAndMaxMassesForMatch(Feature[] features, float massMatchDeltaMass, int massMatchDeltaMassType) { float[] minMassesForMatch = new float[features.length]; float[] maxMassesForMatch = new float[features.length]; for (int j=0; j<features.length; j++) { float ms1FeatureMass = features[j].getMass(); minMassesForMatch[j] = ms1FeatureMass - massMatchDeltaMass; maxMassesForMatch[j] = ms1FeatureMass + massMatchDeltaMass; if (massMatchDeltaMassType == FeatureSetMatcher.DELTA_MASS_TYPE_PPM) { float absDeltaMass = (massMatchDeltaMass * minMassesForMatch[j] / 1000000f); minMassesForMatch[j] = ms1FeatureMass - absDeltaMass; maxMassesForMatch[j] = ms1FeatureMass + absDeltaMass; } } return new float[][] { minMassesForMatch,maxMassesForMatch }; } /** * Return a database containing only entries from the subset of runs * from the passed-in AMT database that match, by mass only, at least * minMassMatchPercent of their entries to ms1Features. * * Add the best runs first, and stop if db gets too big * @param amtDatabase * @param ms1FeaturesOrig * @param minMassMatchPercent * @param massMatchDeltaMass * @param massMatchDeltaMassType * @param maxNewDBEntries * @param modificationsInMS1 * @param showCharts * @return */ public static AmtDatabase removeRunsWithoutMassMatches(AmtDatabase amtDatabase, Feature[] ms1FeaturesOrig, int minMassMatchPercent, float massMatchDeltaMass, int massMatchDeltaMassType, int maxNewDBEntries, int maxNewDBRuns, MS2Modification[] modificationsInMS1, boolean showCharts) { Feature[] ms1FeaturesCopy = new Feature[ms1FeaturesOrig.length]; System.arraycopy(ms1FeaturesOrig, 0, ms1FeaturesCopy, 0, ms1FeaturesCopy.length); Arrays.sort(ms1FeaturesCopy, new Feature.MassAscComparator()); AmtDatabaseMatcher amtDatabaseMatcher = new AmtDatabaseMatcher(); AmtDatabase result = new AmtDatabase(); float[] percentMatchedPerRun = new float[amtDatabase.numRuns()]; AmtRunEntry[] runEntries = amtDatabase.getRuns(); float[][] minAndMaxMassesForMatch = findMinAndMaxMassesForMatch(ms1FeaturesCopy, massMatchDeltaMass, massMatchDeltaMassType); float[] minMassesForMatch = minAndMaxMassesForMatch[0]; float[] maxMassesForMatch = minAndMaxMassesForMatch[1]; final Map<AmtRunEntry, Float> runMassMatchPercentMap = new HashMap<AmtRunEntry, Float>(); int numPassing = 0; for (int i=0; i<runEntries.length; i++) { if (i % (Math.max(runEntries.length/10,1)) == 0) { ApplicationContext.setMessage(Rounder.round(((double) i * 100.0 / (double) runEntries.length),0) + "% done evaluating runs; passing: " + numPassing + " / " + i + " (" + Rounder.round(((double) numPassing * 100.0) / (double) i, 0) + "%)..."); } AmtRunEntry runEntry = runEntries[i]; AmtDatabaseFeatureSetGenerator featureGen = new AmtDatabaseFeatureSetGenerator(amtDatabase); FeatureSet runFeatureSet = featureGen.createFeatureSetForRun(runEntry, modificationsInMS1); Feature[] runFeatures = runFeatureSet.getFeatures(); Arrays.sort(runFeatures, new Feature.MassAscComparator()); int numMassMatchedFeatures = 0; int ms1FeaturesIndex = 0; int runFeaturesIndex = 0; while (runFeaturesIndex < runFeatures.length && ms1FeaturesIndex < ms1FeaturesCopy.length) { while (runFeaturesIndex < runFeatures.length && runFeatures[runFeaturesIndex].getMass() <= minMassesForMatch[ms1FeaturesIndex]) runFeaturesIndex++; if (runFeaturesIndex >= runFeatures.length) break; if (runFeatures[runFeaturesIndex].getMass() <= maxMassesForMatch[ms1FeaturesIndex]) numMassMatchedFeatures++; ms1FeaturesIndex++; } float massMatchesPercent = (((float) numMassMatchedFeatures) / ((float) runFeatureSet.getFeatures().length) * 100); // System.err.println("Matches this run: " + numMassMatchedFeatures + " (" + massMatchesPercent + "%)"); percentMatchedPerRun[i] = massMatchesPercent; runMassMatchPercentMap.put(runEntry, massMatchesPercent); if (massMatchesPercent >= minMassMatchPercent) numPassing++; } if (numPassing == 0) { ApplicationContext.infoMessage("No runs matched enough MS1 features. Quitting"); return result; } AmtRunEntry[] runEntriesSorted = new AmtRunEntry[runEntries.length]; for (int i=0; i<runEntries.length; i++) runEntriesSorted[i] = runEntries[i]; Comparator<AmtRunEntry> runComparatorByMassMatchPercentDesc = new Comparator<AmtRunEntry> () { public int compare(AmtRunEntry o1, AmtRunEntry o2) { float o1Matches = runMassMatchPercentMap.get(o1); float o2Matches = runMassMatchPercentMap.get(o2); return o1Matches < o2Matches ? 1 : o1Matches > o2Matches ? -1 : 0; } }; Arrays.sort(runEntriesSorted, runComparatorByMassMatchPercentDesc); ApplicationContext.infoMessage("Runs passing: " + numPassing + ". Adding runs to database..."); for (AmtRunEntry runEntry : runEntriesSorted) { float massMatchesPercent = runMassMatchPercentMap.get(runEntry); //since runs are sorted, stop if this run doesn't pass if (massMatchesPercent < minMassMatchPercent) break; int numPeptidesIfRunAdded = result.numEntries(); for (AmtPeptideEntry peptideEntry : amtDatabase.getPeptideEntriesForRun(runEntry)) if (!result.contains(peptideEntry.getPeptideSequence())) numPeptidesIfRunAdded++; if (numPeptidesIfRunAdded > maxNewDBEntries) { ApplicationContext.setMessage("Too many entries in passing runs, stopping early to keep DB small"); ApplicationContext.setMessage("Adding " + result.numRuns() + " runs, out of " + numPassing + " mass-matched"); break; } Map<MS2Modification, MS2Modification> modificationMap = result.addRunEntry(runEntry); for (AmtPeptideEntry peptideEntry : amtDatabase.getPeptideEntriesForRun(runEntry)) { AmtPeptideEntry newEntry = null; // newEntry.setPeptideSequence(peptideEntry.getPeptideSequence()); // newEntry.setPredictedHydrophobicity(AmtUtilities.calculateNormalizedHydrophobicity(peptideEntry.getPeptideSequence())); for (AmtPeptideEntry.AmtPeptideModificationStateEntry modState : peptideEntry.getModificationStateEntries()) { AmtPeptideEntry.AmtPeptideObservation obsForRun = modState.getObservationForRun(runEntry); if (obsForRun != null) { if (newEntry == null) { newEntry = AmtPeptideEntry.createEntryFromObservation(peptideEntry.getPeptideSequence(), modState.getModifications(), obsForRun); } else newEntry.addObservation(peptideEntry.getPeptideSequence(), modState.getModifications(), obsForRun); } } if (newEntry != null) result.addObservationsFromEntry(newEntry, modificationMap); } _log.debug("Adding run " + runEntry.getPepXmlFilename() + ", mass matches: " + massMatchesPercent + "%"); if (result.numRuns() >= maxNewDBRuns) { ApplicationContext.setMessage("Too many passing runs, stopping early to keep DB small"); ApplicationContext.setMessage("Adding " + result.numRuns() + " runs, out of " + numPassing + " mass-matched"); break; } } if (result.numRuns() == 0) { ApplicationContext.infoMessage("Too many entries in a single run! Quitting"); return result; } ApplicationContext.infoMessage(result.numRuns() + " out of " + amtDatabase.numRuns() + " runs kept"); ApplicationContext.infoMessage("Database summary: " + result); return result; } public static AmtDatabase removeRunsByStructure(AmtDatabase amtDatabase, int ms1RunCol, int ms1RunRow, int ms1RunExp, int maxNewDBEntries, int maxNewDBRuns, Fractionation2DUtilities.FractionatedAMTDatabaseStructure amtDatabaseStructure, boolean showCharts) { final Map<AmtRunEntry, Float> runDistanceMap = new HashMap<AmtRunEntry, Float>(); AmtRunEntry[] runEntries = amtDatabase.getRuns(); for (int i=0; i<runEntries.length; i++) { Pair<Integer, int[]> expAndPos = amtDatabaseStructure.calculateExperimentAndPosition(i); int runEntryCol = expAndPos.second[0]; int runEntryRow = expAndPos.second[1]; //manhattan distance float distance = Math.abs(runEntryCol - ms1RunCol) + Math.abs(runEntryRow - ms1RunRow); runDistanceMap.put(runEntries[i], distance); } AmtRunEntry[] runEntriesSorted = new AmtRunEntry[runEntries.length]; for (int i=0; i<runEntriesSorted.length; i++) runEntriesSorted[i] = runEntries[i]; Comparator<AmtRunEntry> runComparatorByDistanceAsc = new Comparator<AmtRunEntry> () { public int compare(AmtRunEntry o1, AmtRunEntry o2) { float o1Matches = runDistanceMap.get(o1); float o2Matches = runDistanceMap.get(o2); return o1Matches < o2Matches ? -1 : o1Matches > o2Matches ? 1 : 0; } }; Arrays.sort(runEntriesSorted, runComparatorByDistanceAsc); return removeRunsInOrder(amtDatabase, maxNewDBEntries, maxNewDBRuns, amtDatabaseStructure, runEntriesSorted, showCharts, null, null, null, null); } protected static Set<String> createPeptideSetFromFeatures(Feature[] features) { Set<String> peptides = new HashSet<String>(); for (Feature feature : features) { String peptide = MS2ExtraInfoDef.getFirstPeptide(feature); if (peptide != null) peptides.add(peptide); } return peptides; } /** * Remove runs with insufficient peptide matches to the MS/MS features passed in * @param amtDatabase * @param ms2Features * @param minPeptideMatchPercent * @param maxNewDBEntries * @param maxNewDBRuns * @param showCharts * @return */ public static AmtDatabase removeRunsWithoutPeptideMatches(AmtDatabase amtDatabase, Feature[] ms2Features, int minPeptideMatchPercent, int maxNewDBEntries, int maxNewDBRuns, Fractionation2DUtilities.FractionatedAMTDatabaseStructure amtDatabaseStructure, boolean showCharts, Feature[] ms1Features, MS2Modification[] ms2ModificationsForMatching, AmtDatabaseMatcher dbMatcher) { float[] percentMatchedPerRun = new float[amtDatabase.numRuns()]; AmtRunEntry[] runEntries = amtDatabase.getRuns(); final Map<AmtRunEntry, Integer> runPeptideMatchNumberMap = new HashMap<AmtRunEntry, Integer>(); final Map<AmtRunEntry, Float> runPeptideMatchPercentMap = new HashMap<AmtRunEntry, Float>(); Set<String> ms2Peptides = createPeptideSetFromFeatures(ms2Features); int numPassing = 0; for (int i=0; i<runEntries.length; i++) { if (i % (Math.max(runEntries.length/10,1)) == 0) { ApplicationContext.setMessage(Rounder.round(((double) i * 100.0 / (double) runEntries.length),0) + "% done evaluating runs; passing: " + numPassing + " / " + i + " (" + Rounder.round(((double) numPassing * 100.0) / (double) i, 0) + "%)..."); } AmtRunEntry runEntry = runEntries[i]; int numPeptidesThisRun = 0; int numPeptidesInCommon = 0; for (AmtPeptideEntry peptideEntry : amtDatabase.getEntries()) { String peptideSequence = peptideEntry.getPeptideSequence(); AmtPeptideEntry.AmtPeptideObservation obs = peptideEntry.getObservationForRun(runEntry); if (obs != null) { numPeptidesThisRun++; if (ms2Peptides.contains(peptideSequence)) numPeptidesInCommon++; } } float peptideMatchesPercent = (((float) numPeptidesInCommon) / ((float) numPeptidesThisRun) * 100); // System.err.println("Matches this run: " + numMassMatchedFeatures + " (" + massMatchesPercent + "%)"); percentMatchedPerRun[i] = peptideMatchesPercent; runPeptideMatchPercentMap.put(runEntry, peptideMatchesPercent); runPeptideMatchNumberMap.put(runEntry, numPeptidesInCommon); if (peptideMatchesPercent >= minPeptideMatchPercent) numPassing++; } if (numPassing == 0) { ApplicationContext.infoMessage("No runs matched enough MS1 features. Quitting"); return new AmtDatabase(); } if (showCharts && amtDatabaseStructure != null) { double[] peptideMatchesPerRun = new double[amtDatabase.numRuns()]; for (int i=0; i<peptideMatchesPerRun.length; i++) peptideMatchesPerRun[i] = runPeptideMatchNumberMap.get(amtDatabase.getRunBySequence(i+1)); Fractionation2DUtilities.showHeatMapChart( amtDatabaseStructure, peptideMatchesPerRun, "AMT Database Run Peptides In Common with This Run", true); double[] peptidePercentMatchesPerRun = new double[amtDatabase.numRuns()]; for (int i=0; i<peptidePercentMatchesPerRun.length; i++) peptidePercentMatchesPerRun[i] = runPeptideMatchPercentMap.get(amtDatabase.getRunBySequence(i+1)); Fractionation2DUtilities.showHeatMapChart( amtDatabaseStructure, peptidePercentMatchesPerRun, "AMT Database Run Peptide % In Common with This Run", true); } List<AmtRunEntry> passingRunEntriesList = new ArrayList<AmtRunEntry>(runEntries.length); for (AmtRunEntry runEntry : runEntries) if (runPeptideMatchPercentMap.get(runEntry) >= minPeptideMatchPercent) passingRunEntriesList.add(runEntry); AmtRunEntry[] runEntriesSorted = new AmtRunEntry[passingRunEntriesList.size()]; for (int i=0; i<runEntriesSorted.length; i++) runEntriesSorted[i] = passingRunEntriesList.get(i); Comparator<AmtRunEntry> runComparatorByPeptideMatchPercentDesc = new Comparator<AmtRunEntry> () { public int compare(AmtRunEntry o1, AmtRunEntry o2) { float o1Matches = runPeptideMatchPercentMap.get(o1); float o2Matches = runPeptideMatchPercentMap.get(o2); return o1Matches < o2Matches ? 1 : o1Matches > o2Matches ? -1 : 0; } }; Comparator<AmtRunEntry> runComparatorByPeptideMatchNumberDesc = new Comparator<AmtRunEntry> () { public int compare(AmtRunEntry o1, AmtRunEntry o2) { float o1Matches = runPeptideMatchNumberMap.get(o1); float o2Matches = runPeptideMatchNumberMap.get(o2); return o1Matches < o2Matches ? 1 : o1Matches > o2Matches ? -1 : 0; } }; Arrays.sort(runEntriesSorted, runComparatorByPeptideMatchPercentDesc); ApplicationContext.infoMessage("Runs passing: " + numPassing + ". Adding runs to database..."); return removeRunsInOrder(amtDatabase, maxNewDBEntries, maxNewDBRuns, amtDatabaseStructure, runEntriesSorted, showCharts, ms1Features, ms2ModificationsForMatching, ms2Features, dbMatcher); } /** * Trims down an AMT database by adding runs one by one from runEntriesSorted, stopping when * adding another run would violate maxNewDBEntries or maxNewDBRuns. * @param amtDatabase * @param maxNewDBEntries * @param maxNewDBRuns * @param amtDatabaseStructure * @param runEntriesSorted sorted in _descending_ order of goodness * @param showCharts * @return */ public static AmtDatabase removeRunsInOrder(AmtDatabase amtDatabase, int maxNewDBEntries, int maxNewDBRuns, Fractionation2DUtilities.FractionatedAMTDatabaseStructure amtDatabaseStructure, AmtRunEntry[] runEntriesSorted, boolean showCharts, Feature[] ms1Features, MS2Modification[] ms2ModificationsForMatching, Feature[] ms2Features, AmtDatabaseMatcher databaseMatcher) { //if true, we will engage in an extremely costly plot of FAR vs. number of runs kept in the database. //This was done in order to produce that plot for the HUPO conference. Not normally useful, as //it's very hand-wavy boolean plotFarVsNumRuns = false; AmtDatabase result = new AmtDatabase(); int numSortedRuns = runEntriesSorted.length; //variables for db density vs. FAR chart List<Double> farsWithNumbersOfRuns = new ArrayList<Double>(); List<Double> numEntriesWithNumbersOfRuns = new ArrayList<Double>(); List<AmtRunEntry> runsAdded = new ArrayList<AmtRunEntry>(); for (AmtRunEntry runEntry : runEntriesSorted) { int numPeptidesIfRunAdded = result.numEntries(); for (AmtPeptideEntry peptideEntry : amtDatabase.getPeptideEntriesForRun(runEntry)) if (!result.contains(peptideEntry.getPeptideSequence())) numPeptidesIfRunAdded++; if (numPeptidesIfRunAdded > maxNewDBEntries) { ApplicationContext.setMessage("Too many entries in passing runs, stopping early to keep DB small"); ApplicationContext.setMessage("Adding " + result.numRuns() + " runs, out of " + numSortedRuns); break; } if (result.numRuns() + 1 >= maxNewDBRuns) { ApplicationContext.setMessage("Too many passing runs, stopping early to keep DB small"); ApplicationContext.setMessage("Adding " + result.numRuns() + " runs, out of " + numSortedRuns); break; } Map<MS2Modification, MS2Modification> modificationMap = result.addRunEntry(runEntry); for (AmtPeptideEntry peptideEntry : amtDatabase.getPeptideEntriesForRun(runEntry)) { AmtPeptideEntry newEntry = null; for (AmtPeptideEntry.AmtPeptideModificationStateEntry modState : peptideEntry.getModificationStateEntries()) { AmtPeptideEntry.AmtPeptideObservation obsForRun = modState.getObservationForRun(runEntry); if (obsForRun != null) { if (newEntry == null) { newEntry = AmtPeptideEntry.createEntryFromObservation(peptideEntry.getPeptideSequence(), modState.getModifications(), obsForRun); } else newEntry.addObservation(peptideEntry.getPeptideSequence(), modState.getModifications(), obsForRun); } } if (newEntry != null) result.addObservationsFromEntry(newEntry, modificationMap); } runsAdded.add(runEntry); _log.debug("Adding run " + runEntry.getPepXmlFilename() + " (#" + result.numRuns() + ")"); } if (result.numRuns() == 0) { ApplicationContext.infoMessage("Too many entries in a single run! Quitting"); return result; } if (showCharts && amtDatabaseStructure != null) { double[] matchedRuns = new double[amtDatabase.numRuns()]; for (AmtRunEntry runEntry : amtDatabase.getRuns()) { if (runsAdded.contains(runEntry)) { matchedRuns[amtDatabase.getSequenceForRun(runEntry)-1]++; } } Fractionation2DUtilities.showHeatMapChart( amtDatabaseStructure, matchedRuns, "AMT Runs Kept For Matching", false); } //Only for creating an extremely costly plot if (showCharts && ms1Features != null && ms2ModificationsForMatching != null && plotFarVsNumRuns) { List<Double> numbersOfRuns = new ArrayList<Double>(); for (int i=0; i<farsWithNumbersOfRuns.size(); i++) numbersOfRuns.add((double)(i+1)); System.err.println("Line chart. Size: " + farsWithNumbersOfRuns); PanelWithLineChart pwlc = new PanelWithLineChart(numbersOfRuns, farsWithNumbersOfRuns, "FAR vs. Number of Fractions"); ChartDialog cd = new ChartDialog(pwlc); cd.setTitle("Line chart: FAR vs. number of fractions"); cd.setVisible(true); PanelWithLineChart pwlc2 = new PanelWithLineChart(numEntriesWithNumbersOfRuns, farsWithNumbersOfRuns, "FAR vs. Number of Entries"); ChartDialog cd2 = new ChartDialog(pwlc2); cd2.setTitle("Line chart: FAR vs. number of entries"); cd2.setVisible(true); } ApplicationContext.infoMessage(result.numRuns() + " out of " + amtDatabase.numRuns() + " runs kept"); ApplicationContext.infoMessage("Database summary: " + result); return result; } }