/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.toolbox.proteomics; import org.fhcrc.cpl.toolbox.proteomics.filehandler.FastaLoader; import java.io.*; import java.util.*; /** * User: migra * Date: Jun 23, 2004 * Time: 2:33:14 PM * */ public class PeptideGenerator implements Runnable { public static double[] AMINO_ACID_AVERAGE_MASSES = getMasses(false); public static double[] AMINO_ACID_MONOISOTOPIC_MASSES = getMasses(true); //Index of H-ION in acid tables public static final int H_ION_INDEX = 0; public static final double ELECTRON_MASS = 5.485e-4; public static final int DIGEST_TRYPTIC = 1; public static final int DIGEST_ALL = 2; private String _inputFileName; private String _outputFileName; private boolean _computePI = true; private boolean _computeHp = true; private int _hpWindowSize = 9; private boolean _computeAverageMass = true; private boolean _computeMonoisotopicMass = true; private int _digest=DIGEST_TRYPTIC; private boolean _countOnly; private int _maxMissedCleavages = 0; private double _minMass = 400; private double _maxMass = 6400; private int _maxProteins = Integer.MAX_VALUE; private int _minResidues = -1; private int _maxResidues = Integer.MAX_VALUE; private double[] _massTab = AMINO_ACID_MONOISOTOPIC_MASSES; private boolean _async = true; private ArrayList _peptides; private ArrayList _listeners = new ArrayList(); private int _protNum = 0; private long _pepCount = 0; private long _aaCount = 0; public static void main(String[] args) { PeptideGenerator gp = initFromParams(args); if (null != gp) { if (!gp._countOnly) { System.out.print("prot"); System.out.print('\t'); System.out.print("pep"); if (gp._computeAverageMass) { System.out.print('\t'); System.out.print("m"); } if (gp._computeMonoisotopicMass) { System.out.print('\t'); System.out.print("mm"); } if (gp._computePI) { System.out.print('\t'); System.out.print("pi"); } if(gp._computeHp) { System.out.print('\t'); System.out.print("hp"); } System.out.println(); } Date d = new Date(); gp.run(); if (gp._countOnly) { System.out.print("Proteins: "); System.out.println(gp._protNum); System.out.print("Peptides: "); System.out.println(gp._pepCount); System.out.print("Residues: "); System.out.println(gp._aaCount); } System.err.println("Time: " + String.valueOf(new Date().getTime() - d.getTime())); } } public static PeptideGenerator initFromParams(String[] params) { if (params.length == 0) return paramError(); PeptideGenerator gp = new PeptideGenerator(); for (int i = 0; i < params.length; i++) { if (params[i].startsWith("-compute")) { HashSet computeSet = new HashSet(); String[] split1 = params[i].split("="); if (split1.length != 2) return paramError(); String[] split2 = split1[1].split(","); for (int j = 0; j < split2.length; j++) computeSet.add(split2[j].trim()); gp._computeAverageMass = computeSet.contains("m"); gp._computeMonoisotopicMass = computeSet.contains("mm"); gp._computeHp = computeSet.contains("hp"); gp._computePI = computeSet.contains("pi"); } else if (params[i].startsWith("-digest")) { String[] split = params[i].split("="); if (split.length != 2) return paramError(); String digest = split[1]; if ("tryptic".equals(digest)) gp._digest = DIGEST_TRYPTIC; else if ("all".equals(digest)) gp._digest = DIGEST_ALL; else return paramError(); } else if (params[i].startsWith("-countOnly")) { gp._countOnly = true; } else if (params[i].startsWith("-maxProteins")) { String[] split = params[i].split("="); if (split.length != 2) return paramError(); gp._maxProteins = Integer.parseInt(split[1]); } else if (params[i].startsWith("-massRange")) { String[] split = params[i].split("="); if (split.length != 2) return paramError(); String[] massStrings = split[1].split("-"); gp._minMass = Double.parseDouble(massStrings[0]); gp._maxMass = Double.parseDouble(massStrings[1]); } else if (params[i].startsWith("-maxMissed")) { String[] split = params[i].split("="); if (split.length != 2) return paramError(); gp._maxMissedCleavages = Integer.parseInt(split[1]); } else if (params[i].startsWith("-cys")) { String[] split = params[i].split("="); if (split.length != 2) return paramError(); double cys = Double.parseDouble(split[1]); double[] massTab = getMasses(true); massTab['C'] += cys; gp._massTab = massTab; } else if (!params[i].startsWith("-")) { gp._inputFileName = params[i]; } else { //Unknown option return paramError(); } } if (null == gp._inputFileName) return paramError(); File file = new File(gp._inputFileName); if (!file.exists()) { System.err.println("Could not find file: " + file.getAbsolutePath()); return null; } gp.addListener(gp.getCommandLineListener()); return gp; } private static PeptideGenerator paramError() { System.err.print("syntax: java PeptideGenerator inputFile [-compute={m,mm,pi,hp}] [-digest={tryptic|all}] [-massRange=min-max] [-countOnly] [-cys=m]"); return null; } public void run() { FastaLoader loader = null; try { File fastaFile = new File(_inputFileName); loader = new FastaLoader(fastaFile); } catch (Exception x) { x.printStackTrace(System.err); return; } Iterator it = loader.iterator(); double[] massTab = getMasses(false); if (_minResidues == -1) _minResidues = (int) (_minMass / massTab['W']); if (_maxResidues == Integer.MAX_VALUE) _maxResidues = (int) (_maxMass / massTab['G']); while (it.hasNext()) { _protNum++; Protein p = (Protein) it.next(); byte[] bytes = p.getBytes(); _aaCount += bytes.length; if (_digest == DIGEST_TRYPTIC) doProteinDigest(p); else doProteinAll(p); if (_protNum >= _maxProteins) break; } fireHandleDone(); } public Peptide[] digestProtein(Protein protein) { _async = false; _peptides = new ArrayList(); doProteinDigest(protein); return (Peptide[]) _peptides.toArray(new Peptide[_peptides.size()]); } /** * Create a single Peptide for a fully tryptic peptide sequence. * @param peptideSequence * @return */ public Peptide createPeptideForFullyTrypticPeptideSequence(String peptideSequence) { return digestProtein(new Protein("dummy", peptideSequence.getBytes()))[0]; } public void addListener(PeptideListener listener) { _listeners.add(listener); } public PeptideListener getCommandLineListener() { return new CommandLinePeptideListener(); } private void doProteinDigest(Protein protein) { byte[] bytes = protein.getBytes(); ArrayList rkSites = new ArrayList(); for (int i = 0; i < bytes.length; i++) { byte b = bytes[i]; if ((i + 1 == bytes.length || bytes[i + 1] != 'P') && (b == 'R' || b == 'K')) rkSites.add(new Integer(i)); } byte b = bytes[bytes.length -1]; if (b != 'K' && b != 'R') rkSites.add(new Integer(bytes.length - 1)); Integer[] rk = (Integer[]) rkSites.toArray(new Integer[rkSites.size()]); int prevSite = -1; int nextSite = 0; for (int i = 0; i < rk.length; i++) { double m = 0; for (int j = 0; j <= _maxMissedCleavages && i + j < rk.length; j++) { nextSite = rk[i + j].intValue(); if (nextSite - prevSite > _minResidues && nextSite - prevSite < _maxResidues) { Peptide pep = new Peptide(protein, prevSite + 1, nextSite - prevSite); m = pep.getMass(_massTab); if (m >= _minMass && m <= _maxMass) if (_async) fireHandlePeptide(pep); else _peptides.add(pep); } if (m > _maxMass) break; } prevSite = rk[i].intValue(); } } private void doProteinAll(Protein protein) { byte[] bytes = protein.getBytes(); double[] massTab = getMasses(false); for (int i = 0; i < bytes.length - _minResidues; i++) { double m = massTab['h'] + massTab['o'] + massTab['h']; for (int j = 0; j < _minResidues; j++) m += massTab[bytes[i + j]]; for (int j = _minResidues; j <= _maxResidues && i + j < bytes.length; j++) { m += massTab[bytes[i + j]]; //m = fireHandlePeptide(bytes, i, j); if (m > _maxMass) break; else if (m > _minMass) { if (_countOnly) _pepCount++; else fireHandlePeptide(new Peptide(protein, i, j + 1)); } if(_pepCount > 0 && _pepCount % 1000000 == 0) System.err.println(_pepCount); } } } private void fireHandlePeptide(Peptide peptide) { Iterator iter = _listeners.iterator(); while (iter.hasNext()) { PeptideListener listener = (PeptideListener) iter.next(); listener.handlePeptide(peptide); } } private void fireHandleDone() { Iterator iter = _listeners.iterator(); while (iter.hasNext()) { PeptideListener listener = (PeptideListener) iter.next(); listener.handleDone(); } } public static double computeMass(byte[] bytes, int start, int length, double[] massTab) { double pepMass = massTab['h'] + massTab['o'] + massTab['h']; for (int a = start; a < start + length; a++ ) pepMass += massTab[bytes[a]]; return pepMass; } /** * Taken from AminoAcidMasses.h on sourceforge.net. Returns 128 character array. * * Lower case indexes h, o, c, n, p, s are masses for corresponding elements * Upper case indexes are Amino Acid Masses. * * @param monoisotopic If true, return monoisotopic masses, otherwise average masses * @return */ public static double[] getMasses( boolean monoisotopic) { double[] aaMasses = new double[128]; if (!monoisotopic) { aaMasses['h']= 1.00794; /* hydrogen */ aaMasses['o']= 15.9994; /* oxygen */ aaMasses['c']= 12.0107; /* carbon */ aaMasses['n']= 14.00674; /* nitrogen */ aaMasses['p']= 30.973761; /* phosporus */ aaMasses['s']= 32.066; /* sulphur */ aaMasses['G']= 57.05192; aaMasses['A']= 71.07880; aaMasses['S']= 87.07820; aaMasses['P']= 97.11668; aaMasses['V']= 99.13256; aaMasses['T']=101.10508; aaMasses['C']=103.13880; /* 103.1448, 103.14080 */ aaMasses['L']=113.15944; aaMasses['I']=113.15944; aaMasses['X']=113.15944; aaMasses['N']=114.10384; aaMasses['O']=114.14720; aaMasses['B']=114.59622; aaMasses['D']=115.08860; aaMasses['Q']=128.13072; aaMasses['K']=128.17408; aaMasses['Z']=128.62310; aaMasses['E']=129.11548; aaMasses['M']=131.19256; /* 131.19456 131.1986 */ aaMasses['H']=137.14108; aaMasses['F']=147.17656; aaMasses['R']=156.18748; aaMasses['Y']=163.17596; aaMasses['W']=186.21320; } else /* monoisotopic masses */ { aaMasses['h']= 1.0078250; aaMasses['o']= 15.9949146; aaMasses['c']= 12.0000000; aaMasses['n']= 14.0030740; aaMasses['p']= 30.9737633; aaMasses['s']= 31.9720718; aaMasses['G']= 57.0214636; aaMasses['A']= 71.0371136; aaMasses['S']= 87.0320282; aaMasses['P']= 97.0527636; aaMasses['V']= 99.0684136; aaMasses['T']=101.0476782; aaMasses['C']=103.0091854; //Jason Hogan says 103.0091843, given Sulfur=31.9720707 (from NIST) aaMasses['L']=113.0840636; aaMasses['I']=113.0840636; aaMasses['X']=113.0840636; aaMasses['N']=114.0429272; aaMasses['O']=114.0793126; aaMasses['B']=114.5349350; aaMasses['D']=115.0269428; aaMasses['Q']=128.0585772; aaMasses['K']=128.0949626; aaMasses['Z']=128.5505850; aaMasses['E']=129.0425928; aaMasses['M']=131.0404854; //Jason Hogan says 131.0404843, given Sulfur=31.9720707 (from NIST) aaMasses['H']=137.0589116; aaMasses['F']=147.0684136; aaMasses['R']=156.1011106; aaMasses['Y']=163.0633282; aaMasses['W']=186.0793126; } aaMasses[H_ION_INDEX] = aaMasses['h'] - ELECTRON_MASS; return aaMasses; } /*ASSIGN_MASS*/ /** * PI Calculation */ private static final double PH_MIN = 0; /* minimum pH value */ private static final double PH_MAX =14; /* maximum pH value */ private static final double MAXLOOP = 2000; /* maximum number of iterations */ private static final double EPSI = 0.0001; /* desired precision */ /* the 7 amino acid which matter */ static int R = 'R' - 'A', H = 'H' - 'A', K = 'K' - 'A', D = 'D' - 'A', E = 'E' - 'A', C = 'C' - 'A', Y = 'Y' - 'A'; /* * table of pk values : * Note: the current algorithm does not use the last two columns. Each * row corresponds to an amino acid starting with Ala. J, O and U are * inexistant, but here only in order to have the complete alphabet. * * Ct Nt Sm Sc Sn */ static double[][] pk = new double[][] { /* A */ {3.55, 7.59, 0.0 , 0.0 , 0.0 }, /* B */ {3.55, 7.50, 0.0 , 0.0 , 0.0 }, /* C */ {3.55, 7.50, 9.00 , 9.00 , 9.00 }, /* D */ {4.55, 7.50, 4.05 , 4.05 , 4.05 }, /* E */ {4.75, 7.70, 4.45 , 4.45 , 4.45 }, /* F */ {3.55, 7.50, 0.0 , 0.0 , 0.0 }, /* G */ {3.55, 7.50, 0.0 , 0.0 , 0.0 }, /* H */ {3.55, 7.50, 5.98 , 5.98 , 5.98 }, /* I */ {3.55, 7.50, 0.0 , 0.0 , 0.0 }, /* J */ {0.00, 0.00, 0.0 , 0.0 , 0.0 }, /* K */ {3.55, 7.50, 10.00, 10.00, 10.00 }, /* L */ {3.55, 7.50, 0.0 , 0.0 , 0.0 }, /* M */ {3.55, 7.00, 0.0 , 0.0 , 0.0 }, /* N */ {3.55, 7.50, 0.0 , 0.0 , 0.0 }, /* O */ {0.00, 0.00, 0.0 , 0.0 , 0.0 }, /* P */ {3.55, 8.36, 0.0 , 0.0 , 0.0 }, /* Q */ {3.55, 7.50, 0.0 , 0.0 , 0.0 }, /* R */ {3.55, 7.50, 12.0 , 12.0 , 12.0 }, /* S */ {3.55, 6.93, 0.0 , 0.0 , 0.0 }, /* T */ {3.55, 6.82, 0.0 , 0.0 , 0.0 }, /* U */ {0.00, 0.00, 0.0 , 0.0 , 0.0 }, /* V */ {3.55, 7.44, 0.0 , 0.0 , 0.0 }, /* W */ {3.55, 7.50, 0.0 , 0.0 , 0.0 }, /* X */ {3.55, 7.50, 0.0 , 0.0 , 0.0 }, /* Y */ {3.55, 7.50, 10.00, 10.00, 10.00 }, /* Z */ {3.55, 7.50, 0.0 , 0.0 , 0.0 }, }; static double exp10(double value) { return Math.pow(10.0,value); } static double computePI(byte[] seq, int start, int seq_length) { int[] comp = new int[26]; /* Amino acid composition of the protein */ int nterm_res, /* N-terminal residue */ cterm_res; /* C-terminal residue */ int i; int charge_increment = 0; double charge, ph_mid = 0, ph_min, ph_max; double cter, nter; double carg, clys, chis, casp, cglu, ctyr, ccys; for (i = 0; i < seq_length; i++) /* compute the amino acid composition */ { comp[seq[i + start] - 'A']++; } nterm_res = seq[start] - 'A'; /* Look up N-terminal residue */ cterm_res = seq[start + seq_length-1] - 'A'; /* Look up C-terminal residue */ ph_min = PH_MIN; ph_max = PH_MAX; for (i = 0, charge = 1.0; i<MAXLOOP && (ph_max - ph_min)>EPSI; i++) { ph_mid = ph_min + (ph_max - ph_min) / 2.0; cter = exp10(-pk[cterm_res][0]) / (exp10(-pk[cterm_res][0]) + exp10(-ph_mid)); nter = exp10(-ph_mid) / (exp10(-pk[nterm_res][1]) + exp10(-ph_mid)); carg = comp[R] * exp10(-ph_mid) / (exp10(-pk[R][2]) + exp10(-ph_mid)); chis = comp[H] * exp10(-ph_mid) / (exp10(-pk[H][2]) + exp10(-ph_mid)); clys = comp[K] * exp10(-ph_mid) / (exp10(-pk[K][2]) + exp10(-ph_mid)); casp = comp[D] * exp10(-pk[D][2]) / (exp10(-pk[D][2]) + exp10(-ph_mid)); cglu = comp[E] * exp10(-pk[E][2]) / (exp10(-pk[E][2]) + exp10(-ph_mid)); ccys = comp[C] * exp10(-pk[C][2]) / (exp10(-pk[C][2]) + exp10(-ph_mid)); ctyr = comp[Y] * exp10(-pk[Y][2]) / (exp10(-pk[Y][2]) + exp10(-ph_mid)); charge = carg + clys + chis + nter + charge_increment - (casp + cglu + ctyr + ccys + cter); if (charge > 0.0) { ph_min = ph_mid; } else { ph_max = ph_mid; } } return ph_mid; } public String getInputFileName() { return _inputFileName; } public void setInputFileName(String inputFileName) { this._inputFileName = inputFileName; } public String getOutputFileName() { return _outputFileName; } public void setOutputFileName(String outputFileName) { this._outputFileName = outputFileName; } public boolean isComputePI() { return _computePI; } public void setComputePI(boolean computePI) { this._computePI = computePI; } public boolean isComputeHp() { return _computeHp; } public void setComputeHp(boolean computeHp) { this._computeHp = computeHp; } public int getHpWindowSize() { return _hpWindowSize; } public void setHpWindowSize(int hpWindowSize) { this._hpWindowSize = hpWindowSize; } public boolean isComputeAverageMass() { return _computeAverageMass; } public void setComputeAverageMass(boolean computeAverageMass) { this._computeAverageMass = computeAverageMass; } public boolean isComputeMonoisotopicMass() { return _computeMonoisotopicMass; } public void setComputeMonoisotopicMass(boolean computeMonoisotopicMass) { this._computeMonoisotopicMass = computeMonoisotopicMass; } public int getDigest() { return _digest; } public void setDigest(int digest) { this._digest = digest; } public boolean isCountOnly() { return _countOnly; } public void setCountOnly(boolean countOnly) { this._countOnly = countOnly; } public int getMaxMissedCleavages() { return _maxMissedCleavages; } public void setMaxMissedCleavages(int maxMissedCleavages) { _maxMissedCleavages = maxMissedCleavages; } public double getMinMass() { return _minMass; } public void setMinMass(double minMass) { _minMass = minMass; } public double getMaxMass() { return _maxMass; } public void setMaxMass(double maxMass) { _maxMass = maxMass; } public int getMinResidues() { return _minResidues; } public void setMinResidues(int minResidues) { _minResidues = minResidues; } public int getMaxResidues() { return _maxResidues; } public void setMaxResidues(int maxResidues) { _maxResidues = maxResidues; } public double[] getMassTable() { return _massTab; } public void setMassTable(double[] massTab) { _massTab = massTab; } public class CommandLinePeptideListener implements PeptideListener { PrintStream out; public CommandLinePeptideListener() { out = System.out; } public void handlePeptide(Peptide peptide) { double m = 0; m = peptide.getMass(); if (m > _minMass && m <= _maxMass) { _pepCount++; if (!_countOnly) { out.print(_protNum); out.print('\t'); out.print(peptide.getChars()); if (_computeAverageMass) { out.print('\t'); out.print(m); } if (_computeMonoisotopicMass) { out.print('\t'); out.print(peptide.getMonoisotopicMass()); } if (_computePI) { out.print('\t'); out.print(peptide.getPi()); } if(_computeHp) { out.print('\t'); out.print(peptide.getHydrophobicity()); } out.println(); } } } public void handleDone() { } } public static interface PeptideListener { public void handlePeptide(Peptide peptide); public void handleDone(); } }