/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.viewer.amt; import java.util.*; import java.io.*; import org.fhcrc.cpl.toolbox.proteomics.feature.Feature; import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef; import org.fhcrc.cpl.toolbox.proteomics.Protein; import org.fhcrc.cpl.toolbox.proteomics.filehandler.ProtXmlReader; import org.fhcrc.cpl.toolbox.proteomics.filehandler.ProteinGroup; import org.apache.log4j.Logger; public class AMTFileRestrictor { private static Logger _log = Logger.getLogger(AMTFileRestrictor.class); public static float DEFAULT_MIN_PROTEIN_PROPHET_FOR_RESTRICTION = .95f; /** * Load all proteins with a minimum protein prophet score from a protxml file * @return */ public static ArrayList<Protein> restrictProteinListWithProtXml(List<Protein> fullProteinList, File protXmlFile, float minProbability, int minPeptidesPerProtein) { ArrayList<Protein> proteinsInProtXml = new ArrayList<Protein>(); ProtXmlReader protXmlReader = new ProtXmlReader(protXmlFile); try { Iterator<ProteinGroup> iterator = protXmlReader.iterator(); while (iterator.hasNext()) { ProteinGroup group = iterator.next(); if (group.getGroupProbability() < minProbability) continue; for (int i = 0; i < group.getProteins().size(); i++) { // have to check all proteins in the group ProtXmlReader.Protein protXmlReaderProtein = group.getProteins().get(i); for (int j = 0; j < fullProteinList.size(); j++) { Protein currentProtein = fullProteinList.get(j); // protXml protein_name should be the whole start of the protein header // up to ' ' or carriage return if (currentProtein.getHeader().startsWith(protXmlReaderProtein.getProteinName()) && (currentProtein.getHeader().length() == protXmlReaderProtein.getProteinName().length() || currentProtein.getHeader().charAt(protXmlReaderProtein.getProteinName().length()) == ' ')) { if (protXmlReaderProtein.getTotalNumberPeptides() >= minPeptidesPerProtein) proteinsInProtXml.add(currentProtein); break; } } } } } catch (Exception e) { e.printStackTrace(System.err); return null; } return proteinsInProtXml; } /** * remove all features that match features that were used to support a given list * of proteins in a protxml file * @return */ public static ArrayList<Feature> removeFeaturesWithProteinsFromProtXml( Feature[] inputFeatures, List<Protein> proteinList, File protXmlFile) { //our output feature list starts out with everything ArrayList<Feature> outputFeatureList = new ArrayList<Feature>(inputFeatures.length); //ArrayList<Feature> matchedFeatureList = new ArrayList<Feature>(inputFeatures.length); for (Feature feature : inputFeatures) outputFeatureList.add(feature); ArrayList<Feature> identifiedMatchingFeatures = identifyCorrespondingProteinFeaturesInProtXml( inputFeatures, proteinList, protXmlFile); for (Feature matchingFeature : identifiedMatchingFeatures) { outputFeatureList.remove(matchingFeature); //matchedFeatureList.add(matchingFeature); } /* //HACK!!! FeaturePepXmlWriter pepXmlWriter = new FeaturePepXmlWriter(matchedFeatureList.toArray(new Feature[0]), null); try {pepXmlWriter.write(new File("/home/dhmay/temp/outunmatched.pep.xml"));} catch (Exception e){} //end HACK!!! */ return outputFeatureList; } /** * identify all features that match peptides that were used to support a given list * of proteins in a protxml file * @return */ public static ArrayList<Feature> identifyCorrespondingProteinFeaturesInProtXml( Feature[] inputFeatures, List<Protein> proteinList, File protXmlFile) { ArrayList<Feature> result = new ArrayList<Feature>(); ProtXmlReader protXmlReader = new ProtXmlReader(protXmlFile); int[] proteinBuckets = new int[30]; int[] peptideBuckets = new int[30]; HashMap <String,Integer> peptideHitMap = new HashMap<String,Integer>(); try { Iterator<ProteinGroup> iterator = protXmlReader.iterator(); while (iterator.hasNext()) { ProteinGroup group = iterator.next(); for (int i = 0; i < group.getProteins().size(); i++) { // have to check all proteins in the group ProtXmlReader.Protein protXmlReaderProtein = group.getProteins().get(i); for (Protein currentProtein: proteinList) { // protXml protein_name should be the whole start of the protein header // up to ' ' or carriage return if (currentProtein.getHeader().startsWith(protXmlReaderProtein.getProteinName()) && (currentProtein.getHeader().length() == protXmlReaderProtein.getProteinName().length() || currentProtein.getHeader().charAt(protXmlReaderProtein.getProteinName().length()) == ' ')) { int numPeptidesForProtein=0; //found one of our proteins! identify & remove all the peptides List<ProtXmlReader.Peptide> peptideList = protXmlReaderProtein.getPeptides(); for (ProtXmlReader.Peptide peptide : peptideList) { if (!peptide.isContributingEvidence()) continue; //for loop on inputfeatures, not outputfeatures, so messing //with the list doesn't mess up the loop for (Feature feature : inputFeatures) { if (peptide.getPeptideSequence().equalsIgnoreCase(MS2ExtraInfoDef.getFirstPeptide(feature)) && peptide.getCharge() == feature.getCharge() && peptide.getCalcNeutralPepMass() == feature.getMass()) { numPeptidesForProtein++; Integer whee = peptideHitMap.get(peptide.getPeptideSequence()); if (whee == null) peptideHitMap.put(peptide.getPeptideSequence(), new Integer(1)); else peptideHitMap.put(peptide.getPeptideSequence(),new Integer(whee+1)); result.add(feature); //System.err.println("Removed " + feature.getPeptide()); } } } proteinBuckets[numPeptidesForProtein]++; } } } } } catch (Exception e) { e.printStackTrace(System.err); return null; } int a=0; for (String whee : peptideHitMap.keySet()) { a++; peptideBuckets[peptideHitMap.get(whee)]++; } System.err.println("peptides: " + a); //for (String peptide : peptideHitMap.keySet()) System.err.println(peptide); for (int i=0; i<20; i++) System.err.println("Peptides: " + i + ", proteins: " + proteinBuckets[i]); for (int i=0; i<20; i++) System.err.println("Proteins: " + i + ", peptides: " + peptideBuckets[i]); return result; } }