/* * Copyright (C) Lennart Martens * * Contact: lennart.martens AT UGent.be (' AT ' to be replaced with '@') */ package com.compomics.util.protein; import com.compomics.util.general.IsotopicDistribution; import org.apache.log4j.Logger; import java.io.IOException; import java.util.*; import java.math.BigDecimal; import com.compomics.util.interfaces.Sequence; import com.compomics.util.interfaces.Modification; import com.compomics.util.general.MassCalc; import com.compomics.util.general.UnknownElementMassException; /* * CVS information: * * $Revision: 1.3 $ * $Date: 2007/07/06 09:41:54 $ */ /** * This class represents a sequence for a protein or peptide. <br> * * @see com.compomics.util.interfaces.Sequence * @author Lennart Martens */ public class AASequenceImpl implements Sequence { // Class specific log4j logger for AASequenceImpl instances. Logger logger = Logger.getLogger(AASequenceImpl.class); /** * The Kyte & Doolittle score for AA residues. */ private static Properties iKyte_Doolittle = null; /** * This variable holds the GRAVY (Kyte & Doolittle) score * for the peptide. <br /> * It uses lazy caching! */ private double iGravy = 0.0; /** * This boolean aids in the caching of the GRAVY value. */ private boolean iGravyCached = false; /** * The Meek HPLC retention score for AA residus. */ private static Properties iMeekList = null; /** * This variable holds the HPLC retention time (Meek) score * for the peptide. <br /> * It uses lazy caching! */ private double iMeek = 0.0; /** * This boolean aids in the caching of the Meek value. */ private boolean iMeekCached = false; /** * This varible holds the String that represents the * sequence. */ private String iSequence = null; /** * This Vector holds all the modifications currently on the * sequence. */ private Vector iModifications = null; /** * This variable holds the mass for the sequence. <br /> * It uses lazy caching! */ private double iMass = -1; /** * Constant for internal use. */ private static final int GRAVY = 0; /** * Constant for internal use. */ private static final int MEEK = 1; /** * Constructor that allows the initialization of the sequence * with a String variable representing that sequence. * * @param aSequence String with the sequence. */ public AASequenceImpl(String aSequence) { this(aSequence, null); } /** * Constructor that allows the initialization of the sequence * with a String variable representing that sequence and a * Vector of Modification instances. * * @param aSequence String with the sequence. * @param aMods Vector with Modification instances. */ public AASequenceImpl(String aSequence, Vector aMods) { this.setSequence(aSequence); this.setModifications(aMods); } /** * Default constructir is private. Used in static creation methods. */ private AASequenceImpl() {}; /** * Simple setter for the sequence. It also clears the mass cache. * * @param aSequence String with the sequence to be set. */ public void setSequence(String aSequence) { // Nullpointer check. if (aSequence == null) { throw new NullPointerException("Sequence cannot be 'null'!\n"); } else if (aSequence.trim().equals("")) { throw new IllegalArgumentException("Sequence cannot be empty String!\n"); } this.iSequence = aSequence.trim(); this.iMass = -1.0; this.iGravyCached = false; this.iMeekCached = false; } /** * Simple getter for the sequence. * * @return String the sequence. */ public String getSequence() { return this.iSequence; } /** * Simple setter for the modifications. * * @param aMods Vector with the modifications. */ public void setModifications(Vector aMods) { this.iModifications = aMods; } /** * Simple getter for the modifications. * * @return Vector with the modifications. */ public Vector getModifications() { return this.iModifications; } /** * This method calculates the mass over charge ratio for a given charge * for the current sequence. Returns -1, if the mass could not be calculated. * * @param charge the charge to use for the m/z ratio calculation * @return double with the m/z ratio for this sequence with the given charge, * -1 if the mass could not be calculated. */ public double getMz(int charge) { double tempMz = -1; try { // calculate the m/z ratio tempMz = (getMass() + (((double) charge) * new MassCalc().calculateMass("H"))) / charge; } catch (UnknownElementMassException ume) { logger.error(ume.getMessage(), ume); } return tempMz; } /** * This method calculates the mass for the current sequence. <br> * Mass cached lazily, so after the first calculation it comes from * memory. * * @return double with the mass for this sequence. */ public double getMass() { if (iMass < 0.0) { // We need to calculate the mass since it is not cached. try { MassCalc temp = new MassCalc(MassCalc.MONOAA); this.iMass = temp.calculateMass(iSequence); // Okay, we now have the mass as is. // Next up: apply all changes to the mass as indicated in // the Modifications we have (if any). if (iModifications != null) { int liSize = iModifications.size(); for (int i = 0; i < liSize; i++) { ModificationImplementation m = (ModificationImplementation) iModifications.get(i); int location = m.getLocation(); double delta = 0.0; if (location == 0) { delta = m.getMonoisotopicMassDelta(ModificationImplementation.NTERMINUS); } else if (location > iSequence.length()) { delta = m.getMonoisotopicMassDelta(ModificationImplementation.CTERMINUS); } else { String loc = iSequence.substring(location - 1, location); delta = m.getMonoisotopicMassDelta(loc) - (temp.calculateMass(loc) - 18.010565); } this.iMass += delta; } } } catch (UnknownElementMassException ume) { logger.error(ume.getMessage(), ume); } } return this.iMass; } /** * This method allows the construction of an AASequenceImpl object, complete with modifications * from an annotated sequence String (eg., something like: 'NH2-YS<P>FVATER-COOH' or 'Ace-MATHM<Mox>PIR-COOH'). * * @param aAnnotatedSequence String with the annotated sequence (eg., something like: * 'NH2-YS<P>FVATER-COOH' or 'Ace-MATHM<Mox>PIR-COOH') * @return AASequenceImpl with the sequence and annotated modifications. */ public static AASequenceImpl parsePeptideFromAnnotatedSequence(String aAnnotatedSequence) { AASequenceImpl p = new AASequenceImpl(); p.parseSequenceAndModificationsFromString(aAnnotatedSequence); // C'est fini! return p; } /** * This method is designed to load a sequence and it's set of modifications from a String * which holds the sequence, annotated with all the modifications applied to it. <br> * Typically, the String parsed should be derived from the 'String getModifiedSequence()' * method (see documentation there). * * @see com.compomics.util.interfaces.Modification * @param aStringWithModificiations String with annotated modifications. */ protected void parseSequenceAndModificationsFromString(String aStringWithModificiations) { // First isolate the N-terminal and C-terminal part of the sequence. // Structure is '[nterm] - [sequence_with_mods] - [cterm]' structure. int startSequence = aStringWithModificiations.indexOf("-"); int endSequence = aStringWithModificiations.lastIndexOf("-"); // Isolate the three parts. String nterm = aStringWithModificiations.substring(0, startSequence).trim(); String cterm = aStringWithModificiations.substring(endSequence + 1).trim(); String sequence = aStringWithModificiations.substring(startSequence + 1, endSequence).trim(); ArrayList modifications = new ArrayList(); // Parse the N-terminal modification. // Oh yeah, we only parse when there is a modification ('NH2' means no modification). if (!nterm.equals("NH2")) { // See if the factory knows about this one! Modification ntermMod = ModificationFactory.getModification(nterm, Modification.NTERMINUS, 0); if (ntermMod == null) { // Auwch... throw new IllegalArgumentException("N-terminal modification code '" + nterm + "' was not recognized for the N-terminus by the ModificationFactory!"); } modifications.add(ntermMod); } // Parse the C-terminal modification. // Oh yeah, we only parse when there is a modification ('COOH' means no modification). if (!cterm.equals("COOH")) { // In this case we can only set the modification location later, since we do not know the sequence length yet!! Modification ctermMod = ModificationFactory.getModification(cterm, Modification.CTERMINUS, -1); if (ctermMod == null) { throw new IllegalArgumentException("C-terminal modification code '" + cterm + "' was not recognized for the C-terminus by the ModificationFactory!"); } modifications.add(ctermMod); } // Now cycle the sequence itself. Modifications are flagged by the presence of '<>' around the code. int start = -1; // Create a StringBuffer to both found and excise the modifications, // as well as having a cleaned-up String at the end. StringBuffer sequenceRoller = new StringBuffer(sequence); while ((start = sequenceRoller.indexOf("<")) >= 0) { // Find the end of the modification. int end = sequenceRoller.indexOf(">"); // See if it al makes sense. if (end <= start) { throw new RuntimeException("Parsing failed miserably! Found a closing '>' (at " + end + ") BEFORE the opening '<' (at " + start + ") while attempting to parse modifications from: '" + sequenceRoller + "' (originally: '" + sequence + "')!"); } // We've got a modification code now, let's also get the residue it applies to (because the // ModificationFactory requires it)! String modificationCode = sequenceRoller.substring(start + 1, end); // Check whether we actually have a previous index (if we don't, flag en error!!). if (start == 0) { throw new RuntimeException("First modification ('" + modificationCode + "') in the sequence was found at index O!"); } // Okay, there should be something; garb it! String residue = sequenceRoller.substring(start - 1, start); // The location must not be forgotten! Fortunately, the location is the 'start-1' index (+1 because '0' is the N-terminus!!!) // since we've deleted all previous modifications (so the String 'to the left' of the '<xxx>' is pure sequence). Modification mod = ModificationFactory.getModification(modificationCode, residue, start); // Check whether we identified and obtained a modification. if (mod == null) { throw new IllegalArgumentException("Modification code '" + modificationCode + "' was not recognized for residue '" + residue + "' by the ModificationFactory!"); } // Add this modification to the list. modifications.add(mod); // To close the loop, delete this modification from the StringBuffer. sequenceRoller.delete(start, end + 1); } // Now we should have a set of modifications for this class, // all that remains is checking and then initializing the sequence proper. if (sequenceRoller.indexOf(">") < 0 && sequenceRoller.indexOf("<") < 0) { // No more modifications present. That's good. this.iSequence = sequenceRoller.toString(); // Add the mods as well. Iterator it = modifications.iterator(); while (it.hasNext()) { // Remember that we had to wait with the location setting of the C-terminal modification (if any) // until we knew the sequence length? // If we encounter it here, set it; now we can! Modification lModification = (Modification) it.next(); if (lModification.getLocation() == -1) { lModification.setLocation(iSequence.length() + 1); } this.addModification(lModification); } } else { throw new IllegalArgumentException("Remaining '<' or '>' in the sequence '" + sequenceRoller + "', hinting at unbalanced modification brackets!"); } } /** * This method reports on the length of the current sequence. * * @return int with the length of the sequence. */ public int getLength() { return this.iSequence.length(); } /** * This method gets the GRAVY score (Kyte & Doolittle) from * the cache, or, if it isn't cached, reconstructs it. * * @return double with the GRAVY coefficient. */ public double getGravy() { if (iGravyCached) { // Cached. Do nothing. } else { iGravy = this.calculateScore(GRAVY); iGravyCached = true; } return iGravy; } /** * This method will return an estimated 'net' HPLC retention * time for the sequence based on the table by Meek.<br> * It does NOT take a t0 value, specific to a setup, into account. * * @return double with the 'net' HPLC retention time as calculated * from Meek's table. */ public double getMeek() { if (iMeekCached) { // Cached, do nothing. } else { iMeek = this.calculateScore(MEEK); iMeekCached = true; } return iMeek; } /** * This method will return the sequence with annotated modifications. * For this annotation the key of the modifications will be used. * If the key is a formula, it will be enclosed in '<>' as well. * * @return String with the annotated sequence (i.e.: containing the * modifications. */ public String getModifiedSequence() { String result = null; /// First check if any modifications are present. if (iModifications == null) { result = "NH2-" + this.getSequence() + "-COOH"; } else { StringBuffer tempSeq = new StringBuffer(""); // Cycle the sequence, check for mods for each location in the // sequence. A Vector[] will hold the mods for each position // once we're done. int liSeqLength = iSequence.length(); Vector[] mods = new Vector[liSeqLength + 2]; for (int i = 0; i < liSeqLength + 2; i++) { mods[i] = new Vector(2, 2); } int liSize = iModifications.size(); for (int i = 0; i < liSize; i++) { ModificationImplementation m = (ModificationImplementation) iModifications.elementAt(i); int loc = m.getLocation(); mods[loc].add(m); } // Now, we've got an array of Vectors, each holding the mods for // each location in the sequence (including '0' and 'length+1' for // the N-terminus and C-terminus, respectively). // We'll cycle the sequence, add each character to the StringBuffer // and append to that character all mods (if any). // Nterm first. if (mods[0].size() > 0) { int liTemp = mods[0].size(); Collections.sort(mods[0]); for (int i = 0; i < liTemp; i++) { ModificationImplementation tempMod = (ModificationImplementation) mods[0].get(i); tempSeq.append(tempMod.getCode()); } } else { tempSeq.append("NH2"); } tempSeq.append("-"); // 'Real' sequence. for (int i = 0; i < liSeqLength; i++) { tempSeq.append(iSequence.charAt(i)); int liTemp = mods[i + 1].size(); if (liTemp > 0) { Collections.sort(mods[i + 1]); for (int j = 0; j < liTemp; j++) { ModificationImplementation tempMod = (ModificationImplementation) mods[i + 1].get(j); tempSeq.append("<" + tempMod.getCode() + ">"); } } } tempSeq.append("-"); //C-term last. if (mods[liSeqLength + 1].size() > 0) { int liTemp = mods[liSeqLength + 1].size(); Collections.sort(mods[liSeqLength + 1]); for (int i = 0; i < liTemp; i++) { ModificationImplementation tempMod = (ModificationImplementation) mods[liSeqLength + 1].get(i); tempSeq.append(tempMod.getCode()); } } else { tempSeq.append("COOH"); } result = tempSeq.toString(); } // Voila. return result; } /** * This method will return an AASequenceImpl that represents * an N-terminal truncation of the current sequence. <br> * Note that the applicable modifications (those within the truncation size) * are also represented in the truncated sequence! * * @param aTruncationSize int with the amount of N-terminal residues the * truncated sequence should have. * @return AASEquenceImpl with the N-terminal truncated sequence (including modifications). */ public AASequenceImpl getNTermTruncatedSequence(int aTruncationSize) { AASequenceImpl result = null; if (aTruncationSize >= this.getLength()) { result = new AASequenceImpl(this.iSequence); } else { result = new AASequenceImpl(this.iSequence.substring(0, aTruncationSize)); // See if there are modifications, and if so, handle them. if (iModifications != null) { int liSize = iModifications.size(); Vector mods = new Vector(10, 5); for (int i = 0; i < liSize; i++) { // If the modification applies to a position that falls // within the new truncated size, take it with us. // Else, leave it be. ModificationImplementation m = (ModificationImplementation) iModifications.get(i); if (m.getLocation() <= aTruncationSize) { mods.add(m); } } if (mods.size() > 0) { result.setModifications(mods); } } } return result; } /** * This method will return an AASequenceImpl that represents * a C-terminal truncation of the current sequence. <br> * Note that the applicable modifications (those within the truncation size) * are also represented in the truncated sequence! * * @param aTruncationSize int with the amount of C-terminal residues the * truncated sequence should have. * @return AASEquenceImpl with the C-terminal truncated sequence (including modifications). */ public AASequenceImpl getCTermTruncatedSequence(int aTruncationSize) { AASequenceImpl result = null; if (aTruncationSize >= this.getLength()) { result = new AASequenceImpl(this.iSequence); } else { result = new AASequenceImpl(this.iSequence.substring(this.iSequence.length() - aTruncationSize, this.iSequence.length())); // See if there are modifications, and if so, handle them. if (iModifications != null) { int liSize = iModifications.size(); Vector mods = new Vector(10, 5); for (int i = 0; i < liSize; i++) { // If the modification applies to a position that falls // within the new truncated size, take it with us. // Else, leave it be. ModificationImplementation m = (ModificationImplementation) iModifications.get(i); if (m.getLocation() >= (this.iSequence.length() - aTruncationSize)) { // Recalculate the correct location for the mdoification. Modification m2 = (Modification) m.clone(); m2.setLocation(m.getLocation() - (this.iSequence.length() - aTruncationSize)); mods.add(m2); } } if (mods.size() > 0) { result.setModifications(mods); } } } return result; } /** * This method will return an AASequenceImpl that represents * an internal truncation of the current sequence. <br > * Note that the applicable modifications (those within the truncation size) * are also represented in the resulting truncated sequence! * * @param aStart int with the start (N-terminal) residue for the * truncation. The first residue is number '1'. * @param aEnd int with the end residue (C-terminal; NOT included) for the truncation. * @return AASEquenceImpl with the C-terminal truncated sequence (including modifications). */ public AASequenceImpl getTruncatedSequence(int aStart, int aEnd) { AASequenceImpl result = null; if (aStart <= 0 && aEnd >= this.getLength()) { result = new AASequenceImpl(this.iSequence); } else { result = new AASequenceImpl(this.iSequence.substring(aStart - 1, aEnd - 1)); // See if there are modifications, and if so, handle them. if (iModifications != null) { int liSize = iModifications.size(); Vector mods = new Vector(10, 5); for (int i = 0; i < liSize; i++) { // If the modification applies to a position that falls // within the new truncated size, take it with us. // Else, leave it be. ModificationImplementation m = (ModificationImplementation) iModifications.get(i); if (m.getLocation() >= aStart && m.getLocation() < aEnd) { Modification m2 = (Modification) m.clone(); m2.setLocation(m.getLocation() - (aStart - 1)); mods.add(m2); } else if (m.getLocation() == 0 && aStart <= 1) { // Add the N-term modification. mods.add(m); } else if (m.getLocation() == this.iSequence.length() + 1 && aEnd >= this.iSequence.length() + 1) { // Add the C-term modification. Modification m2 = (Modification) m.clone(); m2.setLocation(m.getLocation() - (aStart - 1)); mods.add(m2); } } if (mods.size() > 0) { result.setModifications(mods); } } } return result; } /** * This method reports whether a certain residu (or fixed sequence * String) is found in the current sequence. * * @param aSequence with the residu (or fixed sequence of residus) * to find in the current sequence. * @return boolean that indicates whether the sequence contains * the indicated residu (or sequence). */ public boolean contains(String aSequence) { boolean result = false; if (this.iSequence.indexOf(aSequence) >= 0) { result = true; } return result; } /** * This method adds a modification to the list of modifications. * * @param aModification Modification instance to add to the modifications list. */ public void addModification(Modification aModification) { if (this.iModifications == null) { iModifications = new Vector(this.iSequence.length() + 2, 2); } this.iModifications.add(aModification); // Undo the cache. this.iMass = -1.0; } /** * Calculation consists of adding all the coefficients for the * recognized AA, and then dividing the result by the number of * additions done (== the number of AA recognized == the number * of AA for which we have a score). This is a simple mean, btw. * * @param aList int with the list to use (use only final * member vars of this class!). * @return double with the score for the sequence. */ private double calculateScore(int aList) { double temp = 0.0; Properties tempList = new Properties(); switch (aList) { case GRAVY: if (iKyte_Doolittle == null) { iKyte_Doolittle = this.loadProps("kyte_doolittle.properties"); } tempList = iKyte_Doolittle; break; case MEEK: if (iMeekList == null) { iMeekList = this.loadProps("meek.properties"); } tempList = iMeekList; break; } // Cycle the sequence, add each known AA and // divide by the number of additions. int additions = 0; for (int i = 0; i < iSequence.length(); i++) { String key = Character.toString(iSequence.charAt(i)); double index = 0.0; if (tempList.containsKey(key)) { index = Double.parseDouble(tempList.getProperty(key)); additions++; } temp += index; } // Division step. if (additions == 0) { additions++; } temp /= additions; BigDecimal bd = new BigDecimal(temp); bd = bd.setScale(3, BigDecimal.ROUND_HALF_EVEN); return bd.doubleValue(); } /** * This method calculates the molecular formula based on the sequence * @return MolecularFormula */ public MolecularFormula getMolecularFormula(){ MolecularFormula lResult = new MolecularFormula(this); return lResult; } /** * This method gives the IsotopicDistribution for the sequence * @return IsotopicDistribution */ public IsotopicDistribution getIsotopicDistribution(){ MolecularFormula lForm = getMolecularFormula(); IsotopicDistribution lCalc = new IsotopicDistribution(lForm); return lCalc; } /** * This method loads a Properties instance from the classpath. * It returns an empty instance and displays an error message * if the Properties instance was not found. * * @param aPropFileName String with the filename for the * properties file. * @return Properties with the props from the file, or an empty * instance if the file was not found. */ private Properties loadProps(String aPropFileName) { Properties p = new Properties(); try { p.load(this.getClass().getClassLoader().getResourceAsStream(aPropFileName)); } catch (IOException ioe) { logger.error("\nProperties file (" + aPropFileName + ") not found in classpath!"); logger.error("All resultant values will be computed to 0.0!!\n"); } return p; } }