/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.viewer.amt; import java.util.*; import java.io.*; import org.fhcrc.cpl.toolbox.proteomics.feature.Feature; import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet; import org.fhcrc.cpl.toolbox.proteomics.feature.matching.FeatureSetMatcher; import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef; import org.fhcrc.cpl.toolbox.proteomics.Protein; import org.fhcrc.cpl.toolbox.proteomics.PeptideGenerator; import org.fhcrc.cpl.toolbox.proteomics.Peptide; import org.fhcrc.cpl.toolbox.proteomics.MS2Modification; import org.apache.log4j.Logger; public class ProteinMatcher { private static Logger _log = Logger.getLogger(ProteinMatcher.class); //minimum peptide length we want to try to match protected static final int MIN_PEPTIDE_LENGTH=5; //maximum number of missed cleavages to account for when matching protected static final int MAX_MISSED_CLEAVAGES=1; /** * This is a bit complicated. The point is to take an MS1 feature set that has already been matched * with MS2 (or with an AMT database), and mine it for more matches, against all the proteins that * we think might be there. * * Why might we think proteins are there? Well, they might be matched in the MS1 file itself. Or * they might be in the embedded MS2 (ms2Features). Or they might be everything in the AMT database, * which presumably represents related runs. * @param matchedMs1Features * @param ms2Features * @param amtDatabase * @param allProteins * @param minPeptideMass * @param maxPeptideMass * @param featureSetMatcher * @param modifications * @return */ public static Map<Protein,List<Feature>> matchAdditionalPeptidesForMatchedProteins( Feature[] matchedMs1Features, Feature[] ms2Features, AmtDatabase amtDatabase, Protein[] allProteins, float minPeptideMass, float maxPeptideMass, FeatureSetMatcher featureSetMatcher, MS2Modification[] modifications) { List<Feature> ms1FeaturesWithoutMatches = new ArrayList<Feature>(); Set<String> initiallyMatchedPeptides = new HashSet<String>(); for (Feature matchedMs1Feature : matchedMs1Features) { String peptide = MS2ExtraInfoDef.getFirstPeptide(matchedMs1Feature); if (peptide == null) ms1FeaturesWithoutMatches.add(matchedMs1Feature); else initiallyMatchedPeptides.add(peptide); } if (ms2Features != null) { for (Feature ms2Feature : ms2Features) { String peptide = MS2ExtraInfoDef.getFirstPeptide(ms2Feature); if (peptide == null) ms1FeaturesWithoutMatches.add(ms2Feature); else initiallyMatchedPeptides.add(peptide); } } if (amtDatabase != null) for (AmtPeptideEntry peptideEntry : amtDatabase.getEntries()) initiallyMatchedPeptides.add(peptideEntry.getPeptideSequence()); Set<Protein> matchedProteins = findProteinsForPeptides(initiallyMatchedPeptides, allProteins); _log.debug("matched proteins: " + matchedProteins.size()); Map<Protein,List<Feature>> proteinNewMatchMap = new HashMap<Protein,List<Feature>>(); Map<Feature, Protein> peptideFeatureProteinMap = new HashMap<Feature, Protein>(); for (Protein matchedProtein : matchedProteins) { List<Feature> unmatchedPeptideFeatures = createFeaturesForProteinPeptides(matchedProtein, initiallyMatchedPeptides, minPeptideMass, maxPeptideMass, modifications); for (Feature feature : unmatchedPeptideFeatures) { if (!initiallyMatchedPeptides.contains(MS2ExtraInfoDef.getFirstPeptide(feature))) peptideFeatureProteinMap.put(feature, matchedProtein); } // _log.debug("protein: created peptide features: " + unmatchedPeptideFeatures.size()); } _log.debug("Created " + peptideFeatureProteinMap.size() + " features representing unmatched peptides"); FeatureSet unmatchedPeptideFeatureSet = new FeatureSet(peptideFeatureProteinMap.keySet().toArray(new Feature[peptideFeatureProteinMap.size()])); _log.debug("Matching against " + ms1FeaturesWithoutMatches.size() + " unmatched MS1 features"); FeatureSet initiallyUnmatchedFeatureSet = new FeatureSet(ms1FeaturesWithoutMatches.toArray(new Feature[ms1FeaturesWithoutMatches.size()])); FeatureSetMatcher.FeatureMatchingResult matchingResult = featureSetMatcher.matchFeatures(initiallyUnmatchedFeatureSet, unmatchedPeptideFeatureSet); for (Feature feature : matchingResult.getMasterSetFeatures()) { Feature bestPeptideFeatureMatch = matchingResult.getBestMatch(feature); Protein sourceProtein = peptideFeatureProteinMap.get(bestPeptideFeatureMatch); //System.err.println("protein: " + sourceProtein.getName()); List<Feature> thisProteinMatchedFeatures = proteinNewMatchMap.get(sourceProtein); if (thisProteinMatchedFeatures == null) { thisProteinMatchedFeatures = new ArrayList<Feature>(); proteinNewMatchMap.put(sourceProtein, thisProteinMatchedFeatures); } MS2ExtraInfoDef.setSinglePeptide(feature, MS2ExtraInfoDef.getFirstPeptide(bestPeptideFeatureMatch)); thisProteinMatchedFeatures.add(feature); } _log.debug("Matched " + proteinNewMatchMap.size() + " proteins out of " + matchedProteins.size() + " initially matched"); _log.debug("Peptides matched: " + matchingResult.getMasterSetFeatures().size()); return proteinNewMatchMap; } public static Set<Protein> findProteinsForPeptides(Set<String> peptides, Protein[] proteins) { Set<Protein> result = new HashSet<Protein>(); for (Protein protein : proteins) { String proteinSequence = protein.getSequenceAsString(); for (String peptide : peptides) { if (proteinSequence.contains(peptide)) result.add(protein); } } return result; } /** * TODO: modifications * @param protein * @param peptidesToExclude * @param minPeptideMass * @param maxPeptideMass * @return */ public static List<Feature> createFeaturesForProteinPeptides(Protein protein, Set<String> peptidesToExclude, float minPeptideMass, float maxPeptideMass, MS2Modification[] modifications) { List<Feature> result = new ArrayList<Feature>(); List<Peptide> proteinPeptides = generatePeptidesFromProtein(protein, 0, MIN_PEPTIDE_LENGTH); // _log.debug("createFeatures: peptides: " + proteinPeptides.size()); for (Peptide peptide : proteinPeptides) { if (peptide.getMonoisotopicMass() >= minPeptideMass && peptide.getMonoisotopicMass() <= maxPeptideMass && !peptidesToExclude.contains(peptide.getChars())) { AmtDatabaseFeatureSetGenerator featureGen = new AmtDatabaseFeatureSetGenerator(null); List<MS2Modification> varModList = new ArrayList<MS2Modification>(); List<MS2Modification> staticModList = new ArrayList<MS2Modification>(); if (modifications != null) for (MS2Modification modification : modifications) { if (modification.getVariable()) varModList.add(modification); else staticModList.add(modification); } List<Feature> peptideFeatures = null; // featureGen.generateModFeaturesForPeptide( // new String(peptide.getChars()), // HydrophobicityNormalizer.normalize(peptide.getHydrophobicity3(),"krokhin",3), // staticModList, // varModList); for (Feature feature : peptideFeatures) result.add(feature); // Feature peptideFeature = new Feature(); // peptideFeature.setScan(1); // peptideFeature.setPeaks(1); // AmtExtraInfoDef.setObservedHydrophobicity(peptideFeature, // HydrophobicityNormalizer.normalize(peptide.getHydrophobicity3(),"krokhin",3)); // MS2ExtraInfoDef.setSinglePeptide(peptideFeature, new String(peptide.getChars())); // peptideFeature.setCharge(1); // peptideFeature.setMass((float) peptide.getMonoisotopicMass()); // peptideFeature.updateMz(); ////System.err.println("Adding feature with mass " + peptideFeature.getMass() + ", hydro " + AmtExtraInfoDef.getObservedHydrophobicity(peptideFeature)); // // result.add(peptideFeature); } } return result; } /** * Returns a HashMap containing, for each Protein, all the features that match peptides found * by tryptic digestion. Each feature in the FeatureSet will be marked with its peptide * @param ms1Features * @param proteins * @param minProteinMatchFeatures minimum number of features to match in order for * protein to be included in results * @return */ public static HashMap<Protein, Map<String, Feature>> findMatchesForAllProteins(FeatureSet ms1Features, Protein[] proteins, MS2Modification[] modifications, AmtDatabase amtDatabase, int minProteinMatchFeatures) { Feature[] ms1FeatureArray = ms1Features.getFeatures().clone(); //make sure none of the features are excluded, to start with for (Feature feature : ms1FeatureArray) { feature.excluded = false; } Feature.MassAscComparator comp = new Feature.MassAscComparator(); Arrays.sort(ms1FeatureArray, comp); // double[] lowMassTable = PeptideMatcher.generateMassTableWithStaticMods( // PeptideGenerator.AMINO_ACID_MONOISOTOPIC_MASSES, // modifications); HashMap<Protein, Map<String, Feature>> result = new HashMap<Protein, Map<String, Feature>>(); for (int i=0; i<proteins.length; i++) { Protein currentProtein = proteins[i]; //status message if (i%(proteins.length/10) == 0) _log.debug("Processed " + i + " out of " + proteins.length + " proteins"); Map<String, Feature> foundFeatures = findFeaturesForProtein(ms1FeatureArray, currentProtein, modifications, amtDatabase); if (foundFeatures.size() >= minProteinMatchFeatures) result.put(currentProtein, foundFeatures); } return result; } /** * Runs digestProtein to break stuff up, setting the digest type, cleavages and peptide * length appropriately * @param protein * @return */ public static ArrayList<Peptide> generatePeptidesFromProtein(Protein protein, int maxMissedCleavages, int minPeptideLength) { PeptideGenerator peptideGenerator = new PeptideGenerator(); //use tryptic digest peptideGenerator.setDigest(PeptideGenerator.DIGEST_TRYPTIC); //allow one missed cleavage peptideGenerator.setMaxMissedCleavages(maxMissedCleavages); //set minimum residues peptideGenerator.setMinResidues(minPeptideLength); Peptide[] peptidesFromProtein = peptideGenerator.digestProtein(protein); ArrayList<Peptide> peptidesFromProteinList = new ArrayList<Peptide>(); for (Peptide peptide : peptidesFromProtein) peptidesFromProteinList.add(peptide); return peptidesFromProteinList; } /** * Runs digestProtein to break stuff up, setting the digest type, cleavages and peptide * length appropriately * @param protein * @return */ public static ArrayList<Peptide> generatePeptidesFromProtein(Protein protein) { return generatePeptidesFromProtein(protein, MAX_MISSED_CLEAVAGES, MIN_PEPTIDE_LENGTH); } /** * returns an ArrayList of features, the matches between the ms1Features and the peptides * in the protein that are also in the amt database. First copies the database and pares out * the entries that don't exist as peptides in this protein, then matches the remaining database * against the MS1 features * @param ms1Features * @param protein * @return */ public static Map<String,Feature> findFeaturesForProtein(Feature[] ms1Features, Protein protein, MS2Modification[] modifications, AmtDatabase amtDatabase) { ArrayList<Peptide> peptidesFromProtein = generatePeptidesFromProtein(protein); //System.err.println("protein peptides: " + peptidesFromProtein.size()); Set<String> peptideSequencesInAmtFromProtein = new HashSet<String>(); for (Peptide peptide : peptidesFromProtein) { String peptideSequence = new String(peptide.getChars()); if (amtDatabase.contains(peptideSequence)) { peptideSequencesInAmtFromProtein.add(peptideSequence); } //for (String amtPeptide : amtDatabase.getPeptides()) //{ // if (peptideSequence.equals(amtPeptide)) System.err.println("A MATCH!!!! entry?" + amtDatabase.getEntry(peptideSequence) + ",***contains: " + amtDatabase.contains(peptideSequence)); //} } //if the protein and the db don't even overlap in peptide IDs, give up if (peptideSequencesInAmtFromProtein.isEmpty()) { _log.debug("No AMT database entries with peptides contained in this protein"); return new HashMap<String, Feature>(); } // AmtDatabase amtDatabaseCopy = (AmtDatabase) amtDatabase.waistDeepCopy(); // for (AmtPeptideEntry peptideEntry : amtDatabaseCopy.getEntries()) // { // if (!peptideSequencesInAmtFromProtein.contains(peptideEntry.getPeptideSequence())) // { // amtDatabaseCopy.removeEntry(peptideEntry.getPeptideSequence()); // } // } AmtDatabase amtDatabaseCopy = new AmtDatabase(); for (AmtRunEntry runEntry : amtDatabase.getRuns()) amtDatabaseCopy.addRunEntry(runEntry); for (AmtPeptideEntry peptideEntry : amtDatabase.getEntries()) { if (peptideSequencesInAmtFromProtein.contains(peptideEntry.getPeptideSequence())) amtDatabaseCopy.addOrOverrideEntry(peptideEntry); } //TODO: some way to do something other than the defaults //TODO: use alignment AmtDatabaseMatcher amtDatabaseMatcher = new AmtDatabaseMatcher(); Map<Feature, List<AmtPeptideEntry>> featureEntryListMap = null; // amtDatabaseMatcher.matchAmtDatabaseWithMS1Features(amtDatabaseCopy, ms1Features, // modifications, null, 1, false, false); //TODO: this is so suspect, it's not even funny Map<String, Feature> result = new HashMap<String, Feature>(); for (Feature feature : featureEntryListMap.keySet()) { List<AmtPeptideEntry> entryList = featureEntryListMap.get(feature); result.put(entryList.get(0).getPeptideSequence(), feature); } return result; } //methods for comparison of protein/peptide files /** * Just handles file IO * @param file * @return */ public static BufferedReader openProteinPeptideFile(File file) { BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(file)); } catch (Exception e) { e.printStackTrace(System.err); } return reader; } /** * skips past any peptide lines to the next protein, in my funky file format * @param reader * @return */ public static Protein getNextProtein(BufferedReader reader) { Protein result = null; try{ String line; while ((line = reader.readLine()) != null) { if (line.length() > 0 && line.charAt(0) == '>') { String proteinHeader = line.substring(1); //process next line as protein sequence ByteArrayOutputStream aaStream = new ByteArrayOutputStream(2048); line = reader.readLine(); byte[] bytes = line.getBytes(); for (int i = 0; i < bytes.length; i++) if ((bytes[i] >= 'A') && (bytes[i] <= 'Z')) { //_aaCounts[bytes[i] - 'A'] ++; aaStream.write(bytes[i]); } result = new Protein(proteinHeader, aaStream.toByteArray()); //System.err.println("Writing protein " + currentProtein.getSequenceAsString()); break; } } } catch (Exception e) { e.printStackTrace(System.err); } return result; } /** * In my funky file format, gobbles a list of peptides, stopping when it hits eof or * a protein definition * @param reader * @return */ public static ArrayList<String> getPeptides(BufferedReader reader) { ArrayList<String> result = new ArrayList<String>(); try { String line; reader.mark(2000); while ((line = reader.readLine()) != null) { if (line.length() > 0 && line.charAt(0) == '>') { //set the mark back to the last protein reader.reset(); break; } else { result.add(line); reader.mark(2000); } } } catch (Exception e) { //TODO: better handling e.printStackTrace(System.err); } return result; } public static int compareProteinsBySequence(Protein prot1, Protein prot2) { return new Protein.SequenceComparator().compare(prot1, prot2); } }