/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.toolbox.proteomics; import org.fhcrc.cpl.toolbox.proteomics.feature.Feature; import org.fhcrc.cpl.toolbox.proteomics.feature.FeaturePepXmlWriter; import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet; import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef; import org.fhcrc.cpl.toolbox.gui.chart.PanelWithChart; import org.fhcrc.cpl.toolbox.gui.chart.PanelWithLineChart; import org.fhcrc.cpl.toolbox.ApplicationContext; import org.fhcrc.cpl.toolbox.filehandler.SimpleXMLStreamReader; import org.fhcrc.cpl.toolbox.Rounder; import org.fhcrc.cpl.toolbox.datastructure.Pair; import org.fhcrc.cpl.toolbox.commandline.CommandLineModuleExecutionException; import org.fhcrc.cpl.toolbox.proteomics.filehandler.ProtXmlReader; import org.fhcrc.cpl.toolbox.proteomics.filehandler.ProteinGroup; import org.fhcrc.cpl.toolbox.proteomics.filehandler.FastaLoader; import org.apache.log4j.Logger; import javax.xml.stream.XMLStreamException; import java.io.*; import java.util.*; import java.util.List; import java.awt.*; /** * This class doesn't do parsing of ProtXML files itself. It uses ProtXmlReader for that. * * This is for utilities to work with the output of ProtXmlReader */ public class ProteinUtilities { protected static Logger _log = Logger.getLogger(ProteinUtilities.class); public static Map<String, Set<String>> loadPeptideProteinMapFromProtXML(File protXmlFile, double minProteinProphet) throws FileNotFoundException, XMLStreamException { return loadPeptideProteinMapFromProtXML(protXmlFile, minProteinProphet, false); } /** * Create a mapping between all peptides noted in the protXml file, and all proteins * that they are associated with. This means all indistinguishable proteins * @param protXmlFile * @return * @throws org.fhcrc.cpl.toolbox.commandline.CommandLineModuleExecutionException */ public static Map<String, Set<String>> loadPeptideProteinMapFromProtXML(File protXmlFile, double minProteinProphet, boolean quantProteinsOnly) throws FileNotFoundException, XMLStreamException { ProtXmlReader protXmlReader = new ProtXmlReader(protXmlFile); //build a map from each peptide to all proteins it supports Map<String, Set<String>> peptideProteinMap = new HashMap<String, Set<String>>(); Iterator<ProteinGroup> iterator = protXmlReader.iterator(); while (iterator.hasNext()) { ProteinGroup group = iterator.next(); if (minProteinProphet > 0 && group.getGroupProbability() < minProteinProphet) continue; for (ProtXmlReader.Protein protXmlReaderProtein : group.getProteins()) { if (quantProteinsOnly && protXmlReaderProtein.getQuantitationRatio() == null) continue; Set<String> proteinNames = new HashSet<String>(); //add first protein proteinNames.add(protXmlReaderProtein.getProteinName()); //add indistinguishable proteins proteinNames.addAll(protXmlReaderProtein.getIndistinguishableProteinNames()); for (ProtXmlReader.Peptide peptide : protXmlReaderProtein.getPeptides()) { String peptideSequence = peptide.getPeptideSequence(); Set<String> proteinsThisPeptide = peptideProteinMap.get(peptideSequence); if (proteinsThisPeptide == null) { proteinsThisPeptide = new HashSet<String>(); peptideProteinMap.put(peptideSequence, proteinsThisPeptide); } proteinsThisPeptide.addAll(proteinNames); } } } return peptideProteinMap; } /** * Look inside a protXML file to find the source PepXML files */ public static List<File> findSourcePepXMLFiles(File protXmlFile) throws FileNotFoundException, XMLStreamException { FileInputStream fIn = new FileInputStream(protXmlFile); SimpleXMLStreamReader parser = new SimpleXMLStreamReader(fIn); List<File> result = new ArrayList<File>(); _log.debug("findSourcePepXMLFiles"); while (parser.hasNext()) { if (!parser.skipToStart("protein_summary_header")) { _log.debug("No protein_summary_header in this protXML file!"); break; } _log.debug("findSourcePepXMLFiles: found protein_summary_header"); String sourceFilesString = parser.getAttributeValue(null, "source_files"); String[] sourceFilePathsArray = sourceFilesString.split(" "); _log.debug("findSourcePepXMLFiles: source_files='" + sourceFilesString + "'"); for (String sourceFilePath : sourceFilePathsArray) { File pepXmlFile = new File(sourceFilePath); _log.debug("findSourcePepXMLFiles: source file " + sourceFilePath); if (!pepXmlFile.exists() || !pepXmlFile.canRead()) throw new FileNotFoundException("Can't read PepXML file " + pepXmlFile.getAbsolutePath()); result.add(pepXmlFile); } } if (result.isEmpty()) throw new FileNotFoundException("No PepXML file specified in ProtXML file " + protXmlFile.getAbsolutePath()); return result; } /** * Generate a sensitivity-and-specificity-curve chart * @param protXmlFile * @return * @throws FileNotFoundException * @throws XMLStreamException */ public static PanelWithChart generateSensSpecChart(File protXmlFile) throws FileNotFoundException, XMLStreamException { FileInputStream fIn = new FileInputStream(protXmlFile); SimpleXMLStreamReader parser = new SimpleXMLStreamReader(fIn); List<Double> minProbValues = new ArrayList<Double>(); List<Double> sensValues = new ArrayList<Double>(); List<Double> specValues = new ArrayList<Double>(); while (parser.hasNext()) { if (!parser.skipToStart("protein_summary_data_filter")) { break; } minProbValues.add((double) Float.parseFloat(parser.getAttributeValue(null, "min_probability"))); sensValues.add((double) Float.parseFloat(parser.getAttributeValue(null, "sensitivity"))); specValues.add((double) Float.parseFloat(parser.getAttributeValue(null, "false_positive_error_rate"))); } if (minProbValues.size() == 0) throw new XMLStreamException("No sensitivity/specificity data found in file " + protXmlFile.getAbsolutePath()); PanelWithLineChart sensSpecChart = new PanelWithLineChart(minProbValues, sensValues, "Sensitivity"); sensSpecChart.addData(minProbValues, specValues, "Specificity"); sensSpecChart.setName("Sens/Spec"); return sensSpecChart; } /** * Generate a sensitivity-and-specificity-curve chart for two files. * Second one will have dashed lines * @param protXmlFile1 * @param protXmlFile2 * @return * @throws FileNotFoundException * @throws XMLStreamException */ public static PanelWithChart generateSensSpecChart(File protXmlFile1, File protXmlFile2) throws FileNotFoundException, XMLStreamException { FileInputStream fIn1 = new FileInputStream(protXmlFile1); SimpleXMLStreamReader parser1 = new SimpleXMLStreamReader(fIn1); List<Double> minProbValues1 = new ArrayList<Double>(); List<Double> sensValues1 = new ArrayList<Double>(); List<Double> specValues1 = new ArrayList<Double>(); while (parser1.hasNext()) { if (!parser1.skipToStart("protein_summary_data_filter")) { break; } minProbValues1.add((double) Float.parseFloat(parser1.getAttributeValue(null, "min_probability"))); sensValues1.add((double) Float.parseFloat(parser1.getAttributeValue(null, "sensitivity"))); specValues1.add((double) Float.parseFloat(parser1.getAttributeValue(null, "false_positive_error_rate"))); } if (minProbValues1.size() == 0) throw new XMLStreamException("No sensitivity/specificity data found in file " + protXmlFile1.getAbsolutePath()); FileInputStream fIn2 = new FileInputStream(protXmlFile2); SimpleXMLStreamReader parser2 = new SimpleXMLStreamReader(fIn2); List<Double> minProbValues2 = new ArrayList<Double>(); List<Double> sensValues2 = new ArrayList<Double>(); List<Double> specValues2 = new ArrayList<Double>(); while (parser2.hasNext()) { if (!parser2.skipToStart("protein_summary_data_filter")) { break; } minProbValues2.add((double) Float.parseFloat(parser2.getAttributeValue(null, "min_probability"))); sensValues2.add((double) Float.parseFloat(parser2.getAttributeValue(null, "sensitivity"))); specValues2.add((double) Float.parseFloat(parser2.getAttributeValue(null, "false_positive_error_rate"))); } if (minProbValues2.size() == 0) throw new XMLStreamException("No sensitivity/specificity data found in file " + protXmlFile2.getAbsolutePath()); PanelWithLineChart sensSpecChart = new PanelWithLineChart(minProbValues1, sensValues1, "Sensitivity, MS2 Only"); sensSpecChart.addData(minProbValues1, specValues1, "Specificity, MS2 Only"); float dash[] = { 10.0f }; Stroke dashedStroke = new BasicStroke(1.0f, BasicStroke.CAP_BUTT, BasicStroke.JOIN_MITER, 10.0f, dash, 0.0f); sensSpecChart.addData(minProbValues2, sensValues2, "Sensitivity, Augmented"); sensSpecChart.addData(minProbValues2, specValues2, "Specificity, Augmented"); sensSpecChart.setSeriesColor(0, Color.BLUE); sensSpecChart.setSeriesColor(1, Color.RED); sensSpecChart.setSeriesColor(2, Color.BLUE); sensSpecChart.getRenderer().setSeriesStroke(2, dashedStroke); sensSpecChart.setSeriesColor(3, Color.RED); sensSpecChart.getRenderer().setSeriesStroke(3, dashedStroke); sensSpecChart.setName("Sens/Spec"); return sensSpecChart; } /** * Create a mapping between all peptides noted in the protXml file, and all protein groups * that they are associated with. * @param protXmlFile * @return * @throws org.fhcrc.cpl.toolbox.commandline.CommandLineModuleExecutionException */ public static Map<String, Set<Integer>> loadPeptideProteinGroupMapFromProtXML(File protXmlFile, double minProteinProphet) throws FileNotFoundException, XMLStreamException { ProtXmlReader protXmlReader = new ProtXmlReader(protXmlFile); //build a map from each peptide to all proteins it supports Map<String, Set<Integer>> peptideProteinGroupMap = new HashMap<String, Set<Integer>>(); Iterator<ProteinGroup> iterator = protXmlReader.iterator(); while (iterator.hasNext()) { ProteinGroup group = iterator.next(); if (minProteinProphet > 0 && group.getGroupProbability() < minProteinProphet) continue; for (ProtXmlReader.Protein protXmlReaderProtein : group.getProteins()) { for (ProtXmlReader.Peptide peptide : protXmlReaderProtein.getPeptides()) { String peptideSequence = peptide.getPeptideSequence(); Set<Integer> proteinGroupsThisPeptide = peptideProteinGroupMap.get(peptideSequence); if (proteinGroupsThisPeptide == null) { proteinGroupsThisPeptide = new HashSet<Integer>(); peptideProteinGroupMap.put(peptideSequence, proteinGroupsThisPeptide); } proteinGroupsThisPeptide.add(group.getGroupNumber()); } } } return peptideProteinGroupMap; } /** * Create a mapping between all proteins in the protxml file, and all peptides * associated with them. This means all indistinguishable proteins * @param protXmlFile * @return * @throws org.fhcrc.cpl.toolbox.commandline.CommandLineModuleExecutionException */ public static Map<String, List<ProtXmlReader.Peptide>> loadProteinPeptideMapFromProtXML(File protXmlFile, double minProteinProphet) throws FileNotFoundException, XMLStreamException { ProtXmlReader protXmlReader = new ProtXmlReader(protXmlFile); //build a map from each protein to its peptide support Map<String, List<ProtXmlReader.Peptide>> proteinPeptideMap = new HashMap<String, List<ProtXmlReader.Peptide>>(); Iterator<ProteinGroup> iterator = protXmlReader.iterator(); int numGroups=0; while (iterator.hasNext()) { numGroups++; ProteinGroup group = iterator.next(); if (minProteinProphet > 0 && group.getGroupProbability() < minProteinProphet) continue; for (ProtXmlReader.Protein protXmlReaderProtein : group.getProteins()) { Set<String> proteinNames = new HashSet<String>(); //add first protein proteinNames.add(protXmlReaderProtein.getProteinName()); //add indistinguishable proteins proteinNames.addAll(protXmlReaderProtein.getIndistinguishableProteinNames()); for (String proteinName : proteinNames) { List<ProtXmlReader.Peptide> peptidesThisProtein = proteinPeptideMap.get(proteinName); if (peptidesThisProtein == null) { peptidesThisProtein = new ArrayList<ProtXmlReader.Peptide>(); proteinPeptideMap.put(proteinName, peptidesThisProtein); } peptidesThisProtein.addAll(protXmlReaderProtein.getPeptides()); } } } _log.debug("loaded " + numGroups + " protein groups from file " + protXmlFile.getAbsolutePath()); return proteinPeptideMap; } /** * Create a mapping between all proteins in the protxml file, and all peptides * associated with them. This means all indistinguishable proteins * @param protXmlFile * @return * @throws org.fhcrc.cpl.toolbox.commandline.CommandLineModuleExecutionException */ public static Map<String, Set<String>> loadProteinPeptideSequenceMapFromProtXML(File protXmlFile, double minProteinProphet) throws FileNotFoundException, XMLStreamException { ProtXmlReader protXmlReader = new ProtXmlReader(protXmlFile); //build a map from each protein to its peptide support Map<String, Set<String>> proteinPeptideMap = new HashMap<String, Set<String>>(); Iterator<ProteinGroup> iterator = protXmlReader.iterator(); Set<String> allPeptideSet = new HashSet<String>(); int numGroups=0; while (iterator.hasNext()) { numGroups++; ProteinGroup group = iterator.next(); if (minProteinProphet > 0 && group.getGroupProbability() < minProteinProphet) continue; for (ProtXmlReader.Protein protXmlReaderProtein : group.getProteins()) { Set<String> proteinNames = new HashSet<String>(); //add first protein proteinNames.add(protXmlReaderProtein.getProteinName()); //add indistinguishable proteins proteinNames.addAll(protXmlReaderProtein.getIndistinguishableProteinNames()); Set<String> peptideSet = new HashSet<String>(); for (ProtXmlReader.Peptide peptide : protXmlReaderProtein.getPeptides()) { String peptideSequence = peptide.getPeptideSequence(); peptideSet.add(peptideSequence); } for (String proteinName : proteinNames) { Set<String> peptidesThisProtein = proteinPeptideMap.get(proteinName); if (peptidesThisProtein == null) { peptidesThisProtein = new HashSet<String>(); proteinPeptideMap.put(proteinName, peptidesThisProtein); } peptidesThisProtein.addAll(peptideSet); allPeptideSet.addAll(peptidesThisProtein); } } } _log.debug("loaded " + numGroups + " protein groups, with " + allPeptideSet.size() + " unique peptides, from file " + protXmlFile.getAbsolutePath()); return proteinPeptideMap; } /** * Create a mapping between all proteins in the protxml file, and all peptides * associated with them. This means all indistinguishable proteins * @param protXmlFile * @return * @throws org.fhcrc.cpl.toolbox.commandline.CommandLineModuleExecutionException */ public static Map<String, Float> loadProteinProbabilityMapFromProtXML(File protXmlFile) throws FileNotFoundException, XMLStreamException { ProtXmlReader protXmlReader = new ProtXmlReader(protXmlFile); //build a map from each protein to its highest score Map<String, Float> proteinScoreMap = new HashMap<String, Float>(); Iterator<ProteinGroup> iterator = protXmlReader.iterator(); while (iterator.hasNext()) { ProteinGroup group = iterator.next(); float groupScore = group.getProbability(); for (ProtXmlReader.Protein protXmlReaderProtein : group.getProteins()) { Set<String> proteinNames = new HashSet<String>(); //add first protein proteinNames.add(protXmlReaderProtein.getProteinName()); //add indistinguishable proteins proteinNames.addAll(protXmlReaderProtein.getIndistinguishableProteinNames()); for (String proteinName : proteinNames) { Float scoreThisProtein = proteinScoreMap.get(proteinName); if (scoreThisProtein == null) { scoreThisProtein = protXmlReaderProtein.getProbability(); proteinScoreMap.put(proteinName, scoreThisProtein); } if (groupScore > scoreThisProtein) proteinScoreMap.put(proteinName, groupScore); } } } return proteinScoreMap; } /** * WARNING! after running through the iterator, the proteins in each group disappear * @param protXmlFile * @return * @throws FileNotFoundException * @throws XMLStreamException */ public static List<ProteinGroup> loadProteinGroupsFromProtXML(File protXmlFile) throws FileNotFoundException, XMLStreamException { ProtXmlReader protXmlReader = new ProtXmlReader(protXmlFile); ProtXmlReader.ProteinGroupIterator pgIterator = protXmlReader.iterator(); List<ProteinGroup> proteinGroupList = new ArrayList<ProteinGroup>(); while (pgIterator.hasNext()) { ProteinGroup pg = pgIterator.next(); //if I don't reference the proteins list, it disappears //todo: maybe do this in some better way? pg.getProteins().size(); proteinGroupList.add(pg); } return proteinGroupList; } public static List<ProtXmlReader.Protein> loadProtXmlProteinsFromProtXML(File protXmlFile) throws FileNotFoundException, XMLStreamException { ProtXmlReader protXmlReader = new ProtXmlReader(protXmlFile); ProtXmlReader.ProteinGroupIterator pgIterator = protXmlReader.iterator(); List<ProtXmlReader.Protein> result = new ArrayList<ProtXmlReader.Protein>(); int numProteinGroups = 0; while (pgIterator.hasNext()) { ProteinGroup proteinGroup = pgIterator.next(); for (ProtXmlReader.Protein protein : proteinGroup.getProteins()) { result.add(protein); } numProteinGroups++; } _log.debug("Loaded proteins from " + numProteinGroups + " protein groups"); return result; } /** * returns null if protein not found with a minimum group probability * @param protXmlFile * @param proteinName * @param minProteinProphetGroupProbability * @return * @throws FileNotFoundException * @throws XMLStreamException */ public static ProtXmlReader.Protein loadFirstProteinOccurrence(File protXmlFile, String proteinName, float minProteinProphetGroupProbability) throws FileNotFoundException, XMLStreamException { List<String> proteins = new ArrayList<String>(); proteins.add(proteinName); return loadFirstProteinOccurrence(protXmlFile, proteins, minProteinProphetGroupProbability).get(proteinName); } /** * returns null if protein not found * @param protXmlFile * @param proteinName * @return * @throws FileNotFoundException * @throws XMLStreamException */ public static ProtXmlReader.Protein loadFirstProteinOccurrence(File protXmlFile, String proteinName) throws FileNotFoundException, XMLStreamException { return loadFirstProteinOccurrence(protXmlFile, proteinName, 0f); } /** * Returns a map from protein names to first occurrences of proteins in the protXML file, * if they exist with minimum probability * @param protXmlFile * @param proteinNames * @param minProteinProphetGroupProbability * @return * @throws FileNotFoundException * @throws XMLStreamException */ public static Map<String, ProtXmlReader.Protein> loadFirstProteinOccurrence(File protXmlFile, Collection<String> proteinNames, float minProteinProphetGroupProbability) throws FileNotFoundException, XMLStreamException { Map<String, ProtXmlReader.Protein> result = new HashMap<String, ProtXmlReader.Protein>(); ProtXmlReader.ProteinGroupIterator proteinIterator = new ProtXmlReader(protXmlFile).iterator(); while (proteinIterator.hasNext()) { Set<String> remainingProteinNames = new HashSet<String>(proteinNames); ProteinGroup proteinGroup = proteinIterator.next(); if (proteinGroup.getProbability() < minProteinProphetGroupProbability) continue; for (ProtXmlReader.Protein protein : proteinGroup.getProteins()) { List<String> allProteinNamesThisProtein = new ArrayList<String>(); allProteinNamesThisProtein.add(protein.getProteinName()); if (protein.getIndistinguishableProteinNames() != null) { allProteinNamesThisProtein.addAll(protein.getIndistinguishableProteinNames()); } List<String> matchedNamesThisProtein = new ArrayList<String>(); for (String proteinName : allProteinNamesThisProtein) { if (proteinNames.contains(proteinName)) matchedNamesThisProtein.add(proteinName); } if (!matchedNamesThisProtein.isEmpty()) { for (String proteinName : matchedNamesThisProtein) { result.put(proteinName, protein); remainingProteinNames.remove(proteinName); } } } } return result; } /** * Create a pepxml file that can be used with proteinprophet */ public static void createPepXml(Feature[] featuresWithPeptides, File fastaFile, File outputFile) throws CommandLineModuleExecutionException { for (Feature feature : featuresWithPeptides) { //assign all features a peptide ID probability of 1 MS2ExtraInfoDef.setPeptideProphet(feature, 1f); feature.setProperty("protein",null); //proteinprophet can't handle charges >3 if (feature.getCharge() > 3) feature.setCharge(3); } assignContainingProteinsToFeatures(featuresWithPeptides, fastaFile); FeaturePepXmlWriter pepXmlWriter = new FeaturePepXmlWriter(featuresWithPeptides, null); pepXmlWriter.setSearchDatabase(fastaFile.getAbsolutePath()); try { pepXmlWriter.write(outputFile); ApplicationContext.infoMessage("Wrote output file " + outputFile.getAbsolutePath()); } catch (Exception e) { _log.error("Failed to save pepXML",e); } } public static Map<String, Protein> loadProteinNameProteinMapFromFasta(File fastaFile) { Map<String, Protein> result = new HashMap<String, Protein>(); FastaLoader fastaLoader = new FastaLoader(fastaFile); FastaLoader.ProteinIterator iterator = fastaLoader.iterator(); while (iterator.hasNext()) { Protein protein = iterator.next(); result.put(protein.getLookup(), protein); } return result; } /** * Load all proteins from a fasta file * @param fastaFile * @return */ public static ArrayList<Protein> loadProteinsFromFasta(File fastaFile) { ArrayList<Protein> proteinArray = new ArrayList<Protein>(); FastaLoader fastaLoader = new FastaLoader(fastaFile); FastaLoader.ProteinIterator iterator = fastaLoader.iterator(); while (iterator.hasNext()) { Protein protein = iterator.next(); proteinArray.add(protein); } return proteinArray; } /** * Load a map from protein ID to protein sequence * @param fastaFile * @return */ public static Map<String, String> loadProteinSequenceMapFromFasta(File fastaFile) { List<Protein> proteins = loadProteinsFromFasta(fastaFile); Map<String, String> result = new HashMap<String, String>(); for (Protein protein : proteins) { result.put(protein.getLookup(), protein.getSequenceAsString()); } return result; } public static Set<String> loadTrypticPeptidesFromFasta(File fastaFile) { FastaLoader fastaLoader = new FastaLoader(fastaFile); FastaLoader.ProteinIterator iterator = fastaLoader.iterator(); Set<String> peptides = new HashSet<String>(); PeptideGenerator pg = new PeptideGenerator(); while (iterator.hasNext()) { Protein protein = iterator.next(); Peptide[] peptidesThisProtein = pg.digestProtein(protein); for (Peptide peptideThisProtein : peptidesThisProtein) peptides.add(new String(peptideThisProtein.getChars())); } return peptides; } public static Map<String, List<String>> loadTrypticPeptideProteinMapFromFasta(File fastaFile) { return loadTrypticPeptideProteinMapFromFasta(fastaFile, 0); } public static Map<String, List<String>> loadTrypticPeptideProteinMapFromFasta(File fastaFile, int maxMissedCleavages) { FastaLoader fastaLoader = new FastaLoader(fastaFile); FastaLoader.ProteinIterator iterator = fastaLoader.iterator(); PeptideGenerator pg = new PeptideGenerator(); pg.setMaxMissedCleavages(maxMissedCleavages); Map<String, List<String>> result = new HashMap<String, List<String>>(); while (iterator.hasNext()) { Protein protein = iterator.next(); Peptide[] peptidesThisProtein = pg.digestProtein(protein); for (Peptide peptideThisProtein : peptidesThisProtein) { String peptide = new String(peptideThisProtein.getChars()); List<String> proteinsThisPeptide = result.get(peptide); if (proteinsThisPeptide == null) { proteinsThisPeptide = new ArrayList<String>(); result.put(peptide, proteinsThisPeptide); } proteinsThisPeptide.add(protein.getLookup()); } } return result; } public static List<String> getTrypticPeptidesForProtein(Protein protein, int maxMissedCleavages) { PeptideGenerator pg = new PeptideGenerator(); pg.setMaxMissedCleavages(maxMissedCleavages); List<String> result = new ArrayList<String>(); Peptide[] peptidesThisProtein = pg.digestProtein(protein); for (Peptide peptideThisProtein : peptidesThisProtein) { result.add(new String(peptideThisProtein.getChars())); } return result; } public static Map<String, Set<String>> findFastaProteinsForPeptides(Collection<String> peptideList, File fastaFile) { FastaLoader fastaLoader = new FastaLoader(fastaFile); FastaLoader.ProteinIterator iterator = fastaLoader.iterator(); Map<String, Set<String>> result = new HashMap<String, Set<String>>(); PeptideGenerator pg = new PeptideGenerator(); while (iterator.hasNext()) { Protein protein = iterator.next(); Peptide[] peptidesThisProtein = pg.digestProtein(protein); for (Peptide peptideThisProtein : peptidesThisProtein) { String peptideSequence = new String(peptideThisProtein.getChars()); if (peptideList.contains(peptideSequence)) { Set<String> proteinsThisPeptide = result.get(peptideSequence); if (proteinsThisPeptide == null) { proteinsThisPeptide = new HashSet<String>(); result.put(peptideSequence, proteinsThisPeptide); } proteinsThisPeptide.add(protein.getName()); } } } return result; } /** * * @param ms1FeaturesWithPeptides * @param fastaFile */ public static void assignContainingProteinsToFeatures(Feature[] ms1FeaturesWithPeptides, File fastaFile) { Protein[] proteinsInFasta = loadProteinsFromFasta(fastaFile).toArray(new Protein[0]); for (int i=0; i<ms1FeaturesWithPeptides.length; i++) { Feature feature = ms1FeaturesWithPeptides[i]; if (i % (ms1FeaturesWithPeptides.length/100) == 0) ApplicationContext.setMessage(Rounder.round((((float) i) * 100f/ms1FeaturesWithPeptides.length),0) + "% complete"); String peptideSequence = MS2ExtraInfoDef.getFirstPeptide(feature); if (peptideSequence == null) continue; for (Protein protein : proteinsInFasta) { String proteinSequence = protein.getSequenceAsString(); if (proteinSequence.contains(peptideSequence)) { MS2ExtraInfoDef.addProtein(feature, protein.getLookup()); } } } } /** * Map peptides to proteins using multiple protxml files * @param peptides * @param protXmlFiles * @param proteinsInFasta * @return * @throws FileNotFoundException * @throws XMLStreamException */ public static Map<String,List<Protein>> mapPeptidesToProteins(Set<String> peptides, File[] protXmlFiles, Protein[] proteinsInFasta, double minProteinProphet) throws FileNotFoundException, XMLStreamException { Map<String,List<Protein>> result = new HashMap<String,List<Protein>>(); for (File protXmlFile : protXmlFiles) { Map<String,List<Protein>> mapThisFile = mapPeptidesToProteins(peptides, protXmlFile, proteinsInFasta, minProteinProphet); for (String peptide : mapThisFile.keySet()) { if (result.containsKey(peptide)) { List<Protein> existingProteins = mapThisFile.get(peptide); for (Protein newProtein : mapThisFile.get(peptide)) { boolean foundIt = false; for (Protein existingProtein : existingProteins) { if (existingProtein.getLookup().equals(newProtein.getLookup())) { foundIt = true; break; } } if (!foundIt) existingProteins.add(newProtein); } } else { result.put(peptide, mapThisFile.get(peptide)); } } } return result; } /** * Map peptides to proteins. If the user has supplied a fasta file, do this based on * sequence. If the user has supplied a protxml file, do this based on that. * * protxml file takes precedence * @param peptides * @return */ public static Map<String,List<Protein>> mapPeptidesToProteins(Set<String> peptides, File protXmlFile, Protein[] proteinsInFasta, double minProteinProphet) throws FileNotFoundException, XMLStreamException { Map<String,List<Protein>> result = new HashMap<String,List<Protein>>(); int i=0; if (protXmlFile != null) { Map<String, Set<String>> peptideIPIMap = loadPeptideProteinMapFromProtXML(protXmlFile,minProteinProphet); for (String peptide : peptideIPIMap.keySet()) { List<Protein> protList = new ArrayList<Protein>(); for(String ipi : peptideIPIMap.get(peptide)) { Protein protein = new Protein(ipi, "".getBytes()); protList.add(protein); } result.put(peptide, protList); } } else { for (String peptide : peptides) { if (i++ % 100 == 0) ApplicationContext.setMessage("Protein mapping peptide " + i + " / " + peptides.size()); for (Protein protein : proteinsInFasta) { String proteinSequence = protein.getSequenceAsString(); if (proteinSequence.contains(peptide)) { List<Protein> proteinList = result.get(peptide); if (proteinList == null) { proteinList = new ArrayList<Protein>(); result.put(peptide, proteinList); } proteinList.add(protein); } } } } return result; } /** * helper method for one featureset * @param featureSet * @param fastaFile */ public static void guessProteinsForFeaturePeptides(FeatureSet featureSet, File fastaFile) { guessProteinsForFeaturePeptides(new FeatureSet[] { featureSet }, fastaFile); } /** * helper method for one featureset * @param featureSet * @param fastaProteins */ public static void guessProteinsForFeaturePeptides(FeatureSet featureSet, Protein[] fastaProteins) { guessProteinsForFeaturePeptides(new FeatureSet[] { featureSet }, null, fastaProteins); } /** * cover method * @param featureSets * @param fastaFile */ public static void guessProteinsForFeaturePeptides(FeatureSet[] featureSets, File fastaFile) { List<Protein> fastaProteins = ProteinUtilities.loadProteinsFromFasta(fastaFile); guessProteinsForFeaturePeptides(featureSets, fastaFile, fastaProteins.toArray(new Protein[fastaProteins.size()])); } /** * For every feature with a peptide in every featureset passed in, find some protein in the fasta * file that contains that peptide, and assign it * @param featureSets * @param fastaProteins */ public static void guessProteinsForFeaturePeptides(FeatureSet[] featureSets, File fastaFile, Protein[] fastaProteins) { Set<String> peptidesRemainingInAllFiles = new HashSet<String>(); for (FeatureSet featureSet : featureSets) { String path = "unknown"; if (featureSet.getSourceFile() != null) path = featureSet.getSourceFile().getAbsolutePath(); ApplicationContext.setMessage("Getting peptides from file " + path); for (Feature feature : featureSet.getFeatures()) { String peptide = MS2ExtraInfoDef.getFirstPeptide(feature); if (peptide != null) peptidesRemainingInAllFiles.add(peptide); } } ApplicationContext.setMessage("Total peptides: " + peptidesRemainingInAllFiles.size()); ApplicationContext.setMessage("Looking for tryptic peptides..."); Map<String, Protein> peptideProteinMap = new HashMap<String, Protein>(); PeptideGenerator pg = new PeptideGenerator(); pg.setMaxMissedCleavages(2); for (Protein protein : fastaProteins) { Peptide[] peptidesThisProtein = pg.digestProtein(protein); for (Peptide peptideThisProtein : peptidesThisProtein) { String peptideSequence = new String(peptideThisProtein.getChars()); if (peptidesRemainingInAllFiles.contains(peptideSequence)) { peptideProteinMap.put(peptideSequence, protein); peptidesRemainingInAllFiles.remove(peptideSequence); } } } ApplicationContext.setMessage("After tryptic digest, " + peptidesRemainingInAllFiles.size() + " peptides remain."); int numProteins = fastaProteins.length; int currentProteinIndex = 0; //the PeptideGenerator method won't always work -- semitryptic searches, etc. if (peptidesRemainingInAllFiles.size() > 0) { ApplicationContext.setMessage("Doing nontryptic on " + peptidesRemainingInAllFiles.size() + " remaining peptides..."); for (Protein protein : fastaProteins) { if (peptidesRemainingInAllFiles.size() == 0) break; if (currentProteinIndex % (Math.max(numProteins / 100,1)) == 0) ApplicationContext.setMessage("\t" + (currentProteinIndex * 100f / numProteins) + "% complete... remaining peptides: " + peptidesRemainingInAllFiles.size()); currentProteinIndex++; Set<String> peptidesFound = new HashSet<String>(); String proteinSequence = protein.getSequenceAsString(); for (String peptide : peptidesRemainingInAllFiles) { if (proteinSequence.contains(peptide)) { peptidesFound.add(peptide); peptideProteinMap.put(peptide, protein); } } peptidesRemainingInAllFiles.removeAll(peptidesFound); } } ApplicationContext.setMessage("All peptides assigned proteins"); for (FeatureSet featureSet : featureSets) { if (fastaFile != null) MS2ExtraInfoDef.setFeatureSetSearchDatabasePath(featureSet, fastaFile.getAbsolutePath()); for (Feature feature : featureSet.getFeatures()) { String peptide = MS2ExtraInfoDef.getFirstPeptide(feature); if (peptide != null) { Protein protein = peptideProteinMap.get(peptide); if (protein == null) { ApplicationContext.infoMessage("\tWARNING!!! no protein for peptide " + peptide); } else { List<String> proteinList = new ArrayList<String>(); proteinList.add(protein.getLookup()); MS2ExtraInfoDef.setProteinList(feature, proteinList); Pair<Character, Character> prevNextAAs = getPrevNextAAs(peptide, protein); char prevAA = prevNextAAs.first; char nextAA = prevNextAAs.second; MS2ExtraInfoDef.setPrevAminoAcid(feature, prevAA); MS2ExtraInfoDef.setNextAminoAcid(feature, nextAA); //WARNING WARNING WARNING!!! //This behavior is trypsin-specific. If another enzyme is used, number of enzymatic //ends will be set incorrectly. //check for start of protein sequence or trypsin digestion at start of peptide (remember proline) int numTrypticEnds = 0; if (prevAA == '-' || (prevAA == 'K' || prevAA == 'R') && !peptide.startsWith("P")) numTrypticEnds++; //check for end of protein sequence or trypsin digestion at end of peptide if (nextAA == ('-') || (nextAA != 'P' && (peptide.endsWith("K") || peptide.endsWith("R")))) numTrypticEnds++; //if (numTrypticEnds != 2) System.err.println("*** " + numTrypticEnds + ", " + peptide + ", " + prevAA + ", " + nextAA); MS2ExtraInfoDef.setNumEnzymaticEnds(feature, numTrypticEnds); } } } } } /** * For every feature with a peptide in every featureset passed in, find ALL proteins in the fasta * file that contains that peptide, and assign them all. This is much more computationally intensive than * finding just one * @param featureSets * @param fastaProteins */ public static void guessAllProteinsForFeaturePeptides(FeatureSet[] featureSets, File fastaFile, Protein[] fastaProteins) { Set<String> peptidesWithNoProteins = new HashSet<String>(); for (FeatureSet featureSet : featureSets) { ApplicationContext.setMessage("Getting peptides from file " + featureSet.getSourceFile().getAbsolutePath()); for (Feature feature : featureSet.getFeatures()) { String peptide = MS2ExtraInfoDef.getFirstPeptide(feature); if (peptide != null) peptidesWithNoProteins.add(peptide); } } ApplicationContext.setMessage("Total peptides: " + peptidesWithNoProteins.size()); ApplicationContext.setMessage("Looking for tryptic peptides..."); Map<String, List<Protein>> peptideProteinMap = new HashMap<String, List<Protein>>(); PeptideGenerator pg = new PeptideGenerator(); pg.setMaxMissedCleavages(2); for (Protein protein : fastaProteins) { Peptide[] peptidesThisProtein = pg.digestProtein(protein); for (Peptide peptideThisProtein : peptidesThisProtein) { String peptideSequence = new String(peptideThisProtein.getChars()); List<Protein> proteinsThisPeptide = peptideProteinMap.get(peptideSequence); if (proteinsThisPeptide == null) { proteinsThisPeptide = new ArrayList<Protein>(); peptideProteinMap.put(peptideSequence, proteinsThisPeptide); } proteinsThisPeptide.add(protein); if (peptidesWithNoProteins.contains(peptideSequence)) { peptidesWithNoProteins.remove(peptideSequence); } } } ApplicationContext.setMessage("After tryptic digest, " + peptidesWithNoProteins.size() + " peptides remain."); int numProteins = fastaProteins.length; int currentProteinIndex = 0; //the PeptideGenerator method won't always work -- semitryptic searches, etc. if (peptidesWithNoProteins.size() > 0) { ApplicationContext.setMessage("Doing nontryptic on " + peptidesWithNoProteins.size() + " remaining peptides..."); for (Protein protein : fastaProteins) { if (peptidesWithNoProteins.size() == 0) break; if (currentProteinIndex % (numProteins / 100) == 0) ApplicationContext.setMessage("\t" + (currentProteinIndex * 100f / numProteins) + "% complete... remaining peptides: " + peptidesWithNoProteins.size()); currentProteinIndex++; Set<String> peptidesFound = new HashSet<String>(); String proteinSequence = protein.getSequenceAsString(); for (String peptide : peptidesWithNoProteins) { if (proteinSequence.contains(peptide)) { peptidesFound.add(peptide); List<Protein> proteinsThisPeptide = peptideProteinMap.get(peptide); if (proteinsThisPeptide == null) { proteinsThisPeptide = new ArrayList<Protein>(); peptideProteinMap.put(peptide, proteinsThisPeptide); } proteinsThisPeptide.add(protein); } } } } ApplicationContext.setMessage("All peptides assigned proteins"); for (FeatureSet featureSet : featureSets) { if (fastaFile != null) MS2ExtraInfoDef.setFeatureSetSearchDatabasePath(featureSet, fastaFile.getAbsolutePath()); for (Feature feature : featureSet.getFeatures()) { String peptide = MS2ExtraInfoDef.getFirstPeptide(feature); if (peptide != null) { List<Protein> proteins = peptideProteinMap.get(peptide); if (proteins == null) { ApplicationContext.infoMessage("\tWARNING!!! no protein for peptide " + peptide); } else { List<String> proteinList = new ArrayList<String>(); for (Protein protein : proteins) proteinList.add(protein.getLookup()); MS2ExtraInfoDef.setProteinList(feature, proteinList); Pair<Character, Character> prevNextAAs = getPrevNextAAs(peptide, proteins.get(0)); char prevAA = prevNextAAs.first; char nextAA = prevNextAAs.second; MS2ExtraInfoDef.setPrevAminoAcid(feature, prevAA); MS2ExtraInfoDef.setNextAminoAcid(feature, nextAA); //WARNING WARNING WARNING!!! //This behavior is trypsin-specific. If another enzyme is used, number of enzymatic //ends will be set incorrectly. //check for start of protein sequence or trypsin digestion at start of peptide (remember proline) int numTrypticEnds = 0; if (prevAA == '-' || (prevAA == 'K' || prevAA == 'R') && !peptide.startsWith("P")) numTrypticEnds++; //check for end of protein sequence or trypsin digestion at end of peptide if (nextAA == ('-') || (nextAA != 'P' && (peptide.endsWith("K") || peptide.endsWith("R")))) numTrypticEnds++; //if (numTrypticEnds != 2) System.err.println("*** " + numTrypticEnds + ", " + peptide + ", " + prevAA + ", " + nextAA); MS2ExtraInfoDef.setNumEnzymaticEnds(feature, numTrypticEnds); } } } } } protected static Pair<Character, Character> getPrevNextAAs(String peptide, Protein protein) { Character prevAA = '-'; Character nextAA = '-'; String proteinSequence = protein.getSequenceAsString(); int prevAAIndex = proteinSequence.indexOf(peptide) - 1; if (prevAAIndex > 0) prevAA = proteinSequence.charAt(prevAAIndex); int nextAAIndex = proteinSequence.indexOf(peptide) + peptide.length(); if (nextAAIndex >= peptide.length() && nextAAIndex < proteinSequence.length()) nextAA = proteinSequence.charAt(nextAAIndex); return new Pair<Character, Character>(prevAA, nextAA); } }