/* * Copyright (C) Lennart Martens * * Contact: lennart.martens AT UGent.be (' AT ' to be replaced with '@') */ /** * Created by IntelliJ IDEA. * User: Lennart * Date: 27-sep-2003 * Time: 20:51:50 */ package com.compomics.util.protein; import org.apache.log4j.Logger; import java.util.*; /* * CVS information: * * $Revision: 1.3 $ * $Date: 2007/07/06 09:41:54 $ */ /** * This class implements an enzyme with a dual specificity; the N-terminus of a * resultant peptide will have certain residue, the C-terminus will have another, * eg. for a C-terminal cutter with N-terminal specificty for 'D' and C-terminal specificity * for 'R': (D)XXXXXR; for an N-terminal cutter with N-terminal specificty for 'W' and C-terminal specificity * for 'K': WXXXXX(K). * * @author Lennart Martens */ public class DualEnzyme extends Enzyme { // Class specific log4j logger for DualEnzyme instances. static Logger logger = Logger.getLogger(DualEnzyme.class); /** * The HashMap with the cleavables for the N-terminal side of the * resultant peptide. */ private HashMap iNtermCleavables = null; /** * The HashMap with the cleavables for the C-terminal side of the * resultant peptide. */ private HashMap iCtermCleavables = null; /** * The code for N-terminal position in the resultant peptide. */ public static final int NTERMINAL = 0; /** * The code for C-terminal position in the resultant peptide. */ public static final int CTERMINAL = 1; /** * This constructor allows you to specify all the information for this * enzyme plus the number of missed cleavages that this instance will allow. * Title and restrict can be 'null'. * * @param aTitle String with the title (or name) for this enzyme. * @param aNtermCleavage String composed of the residues after which cleavage * will occur at the N-terminus of the resultant peptide * (this String will be uppercased). * @param aCtermCleavage String composed of the residues after which cleavage * will occur at the C-terminus of the resultant peptide * (this String will be uppercased). * @param aRestrict String composed of the residues which inhibit cleavage * if present behind any of the cleavable residues (this String will be uppercased). * @param aPosition String which should correspond to "Cterm" or "Nterm" * for each position respectively. * @param aMiscleavages int with the number of supported missed cleavages. */ public DualEnzyme(String aTitle, String aNtermCleavage, String aCtermCleavage, String aRestrict, String aPosition, int aMiscleavages) { super(aTitle, "", aRestrict, aPosition, aMiscleavages); this.setCleavage(aNtermCleavage, DualEnzyme.NTERMINAL); this.setCleavage(aCtermCleavage, DualEnzyme.CTERMINAL); } /** * This constructor allows you to specify all the information for this * enzyme. Title and restrict can be 'null'. * * @param aTitle String with the title (or name) for this enzyme. * @param aNtermCleavage String composed of the residues after which cleavage * will occur at the N-terminus of the resultant peptide * (this String will be uppercased). * @param aCtermCleavage String composed of the residues after which cleavage * will occur at the C-terminus of the resultant peptide * (this String will be uppercased). * @param aRestrict String composed of the residues which inhibit cleavage * if present behind any of the cleavable residues (this String will be uppercased). * @param aPosition String which should correspond to "Cterm" or "Nterm" * for each position respectively. */ public DualEnzyme(String aTitle, String aNtermCleavage, String aCtermCleavage, String aRestrict, String aPosition) { super(aTitle, "", aRestrict, aPosition); this.setCleavage(aNtermCleavage, DualEnzyme.NTERMINAL); this.setCleavage(aCtermCleavage, DualEnzyme.CTERMINAL); } /** * This method allows the caller to specify the cleavable residues. * * @param aCleavage char[] with the cleavable residues * (in <b>UPPER CASE</b>!). */ public void setCleavage(char[] aCleavage) { super.setCleavage(aCleavage); if(aCleavage != null) { iNtermCleavables = new HashMap(aCleavage.length); iCtermCleavables = new HashMap(aCleavage.length); for(int i = 0; i < aCleavage.length; i++) { iNtermCleavables.put(Character.valueOf(aCleavage[i]), "1"); iCtermCleavables.put(Character.valueOf(aCleavage[i]), "1"); } } else { iNtermCleavables = new HashMap(); iCtermCleavables = new HashMap(); } } /** * This method allows the caller to specify the cleavable residues. * They will be read from the String as a continuous summation of * characters (i.e: 'RKGH'). * * @param aCleavage String with the continuous characters * corresponding to the cleavable residues. * Note that the String is uppercased. */ public void setCleavage(String aCleavage) { char[] temp = null; if(aCleavage != null) { temp = aCleavage.toUpperCase().toCharArray(); } this.setCleavage(temp); } /** * This method allows the caller to specify the cleavable residues. * * @param aCleavage char[] with the cleavable residues * (in <b>UPPER CASE</b>!). * @param aTerminus int with the code for the terminal position in * the resultant peptide. Can be 'NTERMINAL' or 'CTERMINAL'. */ public void setCleavage(char[] aCleavage, int aTerminus) { HashMap cleavables = null; if(aCleavage != null) { cleavables = new HashMap(aCleavage.length); for(int i = 0; i < aCleavage.length; i++) { cleavables.put(Character.valueOf(aCleavage[i]), "1"); } } else { cleavables = new HashMap(); } switch(aTerminus) { case NTERMINAL: iNtermCleavables = cleavables; break; case CTERMINAL: iCtermCleavables = cleavables; break; default: throw new IllegalArgumentException("You specified " + aTerminus + " as the terminus code, while it should be " + DualEnzyme.NTERMINAL + " (NTERMINAL) or " + DualEnzyme.CTERMINAL + " (CTERMINAL)!"); } } /** * This method allows the caller to specify the cleavable residus. * They will be read from the String as a continuous summation of * characters (i.e: 'RKGH'). * * @param aCleavage String with the continuous characters * corresponding to the cleavable residues. * Note that the String is uppercased. * @param aTerminus int with the code for the terminal position in * the resultant peptide. Can be 'NTERMINAL' or 'CTERMINAL'. */ public void setCleavage(String aCleavage, int aTerminus) { char[] temp = null; if(aCleavage != null) { temp = aCleavage.toUpperCase().toCharArray(); } this.setCleavage(temp, aTerminus); } /** * This method returns the residues that are used for cleavage at the respective * locations. * * @param aTerminus int with the code for the terminal position in * the resultant peptide. Can be 'NTERMINAL' or 'CTERMINAL'. * @return char[] with the cleavable residues for the specified terminus of the resultant peptide. */ public char[] getCleavage(int aTerminus) { Set keys = null; switch(aTerminus) { case NTERMINAL: keys = iNtermCleavables.keySet(); break; case CTERMINAL: keys = iCtermCleavables.keySet(); break; default: throw new IllegalArgumentException("You specified " + aTerminus + " as the terminus code, while it should be " + DualEnzyme.NTERMINAL + " (NTERMINAL) or " + DualEnzyme.CTERMINAL + " (CTERMINAL)!"); } ArrayList lList = new ArrayList(keys); Collections.sort(lList); char[] result = new char[lList.size()]; for (int i = 0; i < lList.size(); i++) { char c = ((Character) lList.get(i)).charValue(); result[i] = c; } return result; } /** * Simple getter for the cleavable residues of the Enzyme. * For a DualEnzyme, it returns [nterms]X[cterms] * * @return char[] with the cleavable residues, structured as [nterms]X[cterms]. */ public char[] getCleavage() { Set nTermKeys = iNtermCleavables.keySet(); Set cTermKeys = iCtermCleavables.keySet(); char[] result = new char[nTermKeys.size() + cTermKeys.size() + 1]; int counter = 0; Iterator iter = nTermKeys.iterator(); while(iter.hasNext()) { Character lCharacter = (Character)iter.next(); result[counter] = lCharacter.charValue(); counter++; } // Put an 'X' in between. result[counter] = 'X'; counter++; iter = cTermKeys.iterator(); while(iter.hasNext()) { Character lCharacter = (Character)iter.next(); result[counter] = lCharacter.charValue(); } return result; } /** * Provides a cloned version of this DualEnzyme. * * @return Enzyme with a clone for this DualEnzyme. */ public Object clone() { DualEnzyme de = (DualEnzyme)super.clone(); if(de != null) { de.iCtermCleavables = this.iCtermCleavables; de.iNtermCleavables = this.iNtermCleavables; } return de; } /** * This method generates a String representation of the DualEnzyme, * which is useful for displaying as useful information for the user or * during testing/debugging. * * @return String with a textual description of this Enzyme. */ public String toString() { return this.toString(""); } /** * This method generates a String representation of the DualEnzyme, * which is useful for displaying as useful information for the user or * during testing/debugging. It takes a parameter String that is prepended to each line. * * @param aPrepend String to prepend to each outputted line. * @return String with a textual description of this DualEnzyme. */ public String toString(String aPrepend) { StringBuffer result = new StringBuffer("\n" + aPrepend + "Hi, I'm the DualEnzyme '" + this.getTitle() + "'.\n"); result.append(aPrepend + "I cleave at the sight of:\n"); result.append(aPrepend + "\t- Nterminal: '" + new String(this.getCleavage(DualEnzyme.NTERMINAL)) + "'.\n"); result.append(aPrepend + "\t- Cterminal: '" + new String(this.getCleavage(DualEnzyme.CTERMINAL)) + "'.\n"); if(this.getRestrict() != null && this.getRestrict().length > 0) { result.append(aPrepend + "My activity is restricted by these residus: '" + new String(this.getRestrict()) + "'.\n"); } else { result.append(aPrepend + "There are no residus that restrict my activity.\n"); } result.append(aPrepend + "My position is '" + ((this.getPosition() == Enzyme.CTERM)?"C-terminal":"N-terminal") + "'.\n"); result.append(aPrepend + "I currently allow " + ((this.getMiscleavages() == 0)?"no":"up to " + this.getMiscleavages()) + " missed cleavage" + ((this.getMiscleavages() == 1)?"":"s") + ".\n"); return result.toString(); } /** * This cleave method will process sequence XDYRZ solely into * YR peptides. * * @param aProtein Protein instance to cleave. * @return Protein[] with the resultant peptides. */ public Protein[] oldCleave(Protein aProtein) { // Final result. Protein[] result = null; // Intermediate result. ArrayList proteins = new ArrayList(); // Get the header. Header header = aProtein.getHeader(); // Get the sequence as String and as char[]. String seqString = aProtein.getSequence().getSequence(); char[] sequence = seqString.toCharArray(); // Get the start location. int start = aProtein.getHeader().getStartLocation()-1; // If there was no start location known, set it to be '0'. if(start<0) { start = 0; } // Cycle each char in the String. for(int i = 0; i < sequence.length; i++) { if(this.isCleavable(sequence, i, iNtermCleavables, iRestrictors)) { // In getting here, we can be sure that this residue is a starting // cleavage site. int init = i; // Now find the peptides we can construct from here on, // with the specified number of missed cleavages. int countMC = 0; for(int tryout=i;tryout<sequence.length;tryout++) { if(countMC > iMiscleavages) { break; } // Correct cleavage site is detected if: // - it really is a cleavable residue without a restrictor. // - it is the C-terminus of the sequence, in which case it is also correct. if(this.isCleavable(sequence, tryout, iCtermCleavables, iRestrictors) || tryout == (sequence.length-1)) { // Add one to the missed cleavage counter. countMC++; // Create a new Protein based on the info we now have. Header h = (Header)header.clone(); int tempStart = -1; int tempStop = -1; // We need to recalculate the start and stop indices, depending on existing start and // stop indices and N-terminal or C-terminal cleavage. if(this.iPosition == Enzyme.CTERM) { // Take the part, starting from init+1 up to and including the current // as a new peptide and store it in the interMed Vector. tempStart = init+1; tempStop = tryout+1; } else if(this.iPosition == Enzyme.NTERM){ tempStart = init; tempStop = tryout; // If it was the C-terminus, it should be included since it is not enzymatic and // therefore not N-terminally cleaved off! if(tryout == (sequence.length-1)) { tempStop++; } } // Human readable start and stop in the header, hence the '+1'. h.setLocation(start+tempStart+1, start+tempStop); proteins.add(new Protein(h, new AASequenceImpl(seqString.substring(tempStart, tempStop)))); } } } } // Now transform the ArrayList into an array. result = new Protein[proteins.size()]; proteins.toArray(result); // That's it. return result; } /** * This method is the focus of the Enzyme instance. It can perform * an <i>in-silico</i> digest of a Protein sequence according to the * specifications detailed in the construction or via the setters. * * @param aProtein Protein instance to cleave. * @return Protein[] with the resultant peptides. */ public Protein[] cleave(Protein aProtein) { // Final result. Protein[] result = null; // Intermediate result. ArrayList proteins = new ArrayList(); // Get the header. Header header = aProtein.getHeader(); // Get the sequence as String and as char[]. String seqString = aProtein.getSequence().getSequence(); char[] sequence = seqString.toCharArray(); // Get the start location. int start = aProtein.getHeader().getStartLocation()-1; // If there was no start location known, set it to be '0'. if(start<0) { start = 0; } // Previous C-term cleavage position (without taking miscleavages into account). int previousCtermCleavagePosition = 0; // Previous N-term cleavage position (without taking miscleavages into account). int previousNtermCleavagePosition = 0; // Cycle each char in the String. for(int i = 0; i < sequence.length; i++) { if(this.isCleavable(sequence, i, iNtermCleavables, iRestrictors)) { // In getting here, we can be sure that this residue is a starting // cleavage site. int init = i; // Thus, the peptide from the previous C-terminal cleavage residue up // to this one (for C-term cleavage, including this one), is also an // enzymatic peptide. We'll process it here separately. int endLoc = -1; if(this.iPosition == Enzyme.CTERM) { endLoc = i+1; } else if(this.iPosition == Enzyme.NTERM) { endLoc = i; } // The starting location of this 'intermediate' peptide depends on the residue // order. If the previous C-terminal cleavage is greater than the current N-terminal position // (as in FGDHVDGHRTS, for instance), we should take the previous N-terminal position as starting // point. int startLoc = previousCtermCleavagePosition; if(previousCtermCleavagePosition > init) { startLoc = previousNtermCleavagePosition; } Header intermed = (Header)header.clone(); // The header locations are defined to be human-readable (ie., start from '1'). // Therefore the start location is augmented with 1, the end location (which in Java is not inclusive) // does not need to be augmented. intermed.setLocation(start + startLoc + 1, start + endLoc); proteins.add(new Protein(intermed, new AASequenceImpl(seqString.substring(startLoc, endLoc)))); // Set the previous N-term cleavage position. if(this.iPosition == Enzyme.CTERM) { previousNtermCleavagePosition = init+1; } else if(this.iPosition == Enzyme.NTERM) { previousNtermCleavagePosition = init; } // Now find the peptides we can construct from here on, // with the specified number of missed cleavages. int countMC = 0; for(int tryout=i;tryout<sequence.length;tryout++) { if(countMC > iMiscleavages) { break; } // Correct cleavage site is detected if: // - it really is a cleavable residue without a restrictor. // - it is the C-terminus of the sequence, in which case it is also correct. if(this.isCleavable(sequence, tryout, iCtermCleavables, iRestrictors) || tryout == (sequence.length-1)) { // Add one to the missed cleavage counter. countMC++; // Create a new Protein based on the info we now have. Header h = (Header)header.clone(); int tempStart = -1; int tempStop = -1; // We need to recalculate the start and stop indices, depending on existing start and // stop indices and N-terminal or C-terminal cleavage. if(this.iPosition == Enzyme.CTERM) { // Take the part, starting from init+1 up to and including the current // as a new peptide and store it in the interMed Vector. tempStart = init+1; tempStop = tryout+1; } else if(this.iPosition == Enzyme.NTERM){ tempStart = init; tempStop = tryout; // If it was the C-terminus, it should be included since it is not enzymatic and // therefore not N-terminally cleaved off! if(tryout == (sequence.length-1)) { tempStop++; } } // Extra check; if we have a cleavable at the C-terminus, we'll have an // empty String for a peptide. Which isn't good. Skip this. if(tempStart == tempStop) { continue; } // If this is the first cleavage site (at this point countMC == 1) // then store the current C-terminal cleavage location for use with // the next intermediate peptide. if(countMC == 1) { previousCtermCleavagePosition = tempStop; } // The header locations are defined to be human-readable (ie., start from '1'). // Therefore the start location is augmented with 1, the end location (which in Java is not inclusive) // does not need to be augmented. h.setLocation(start+tempStart+1, start+tempStop); proteins.add(new Protein(h, new AASequenceImpl(seqString.substring(tempStart, tempStop)))); } } } } // Before we finalize the cleaving, we should probably check for the C-terminus of the protein, // since this is probably not cleaved up till now. if(previousCtermCleavagePosition < sequence.length) { Header h = (Header)header.clone(); h.setLocation(start + previousCtermCleavagePosition + 1, start + sequence.length); proteins.add(new Protein(h, new AASequenceImpl(seqString.substring(previousCtermCleavagePosition, sequence.length)))); } // Now transform the ArrayList into an array. result = new Protein[proteins.size()]; proteins.toArray(result); // That's it. return result; } /** * This method reports on the possibility that the presented subsequence * (represented by the start and end location in the parent) is the result * of enzymatic activity. * Returning int values can be checked against public static final vars on this class. * * @param aParentSequence String with the parent sequence * @param aStart int with the start of the subsequence relative to the parent (first residue is '1'). * @param aEnd int with the end of the subsequence relative to the parent * @return int with the coded possibility (1 = Full enzymatic product, * 2 = N-terminal half enzymatic product, 3 = C-terminal half * enzymatic product and 4 = Entirely not an enzymatic product. */ public int isEnzymaticProduct(String aParentSequence, int aStart, int aEnd) { int result = 0; // Correction for human-readable indices. aStart--; aEnd--; // Check validity of parameters. if((aStart < 0) || (aEnd <0)) { throw new IllegalArgumentException("Subsequence is not a subsequence of the parent!"); } if(aEnd > aParentSequence.length()-1) { throw new IllegalArgumentException("Subsequence end index out of parent length range (" + aEnd + ">" + (aParentSequence.length()-1) + ")!"); } if(aStart >= aEnd) { throw new IllegalArgumentException("Subsequence could not be retreived since start index is greater than or equal to end index (" + aStart + ">=" + aEnd + ")!"); } // The maximum length of the sequence. int maxLength = aParentSequence.length(); if(this.getPosition() == Enzyme.CTERM) { // First check N-terminal side. if((aStart-1) >= 0) { Character residue = Character.valueOf(aParentSequence.charAt(aStart-1)); Character possRestrict = Character.valueOf(aParentSequence.charAt(aStart)); if((this.iNtermCleavables.get(residue) != null) && (this.iRestrictors.get(possRestrict) == null)) { result += 1; } } else { // It is the N-terminus of the parent. This checks out. result += 1; } // Now C-terminal side. Character residue = Character.valueOf(aParentSequence.charAt(aEnd)); if((this.iCtermCleavables.get(residue) != null) || ((aEnd+1) == maxLength)) { if((aEnd+1) < maxLength) { if(this.iRestrictors.get(Character.valueOf(aParentSequence.charAt(aEnd+1))) == null) result += 2; } else { // The cleavage site appears to be the C-terminal residue in // the parent sequence. This qualifies. result += 2; } } } else { // First check N-terminal side. Character residue = Character.valueOf(aParentSequence.charAt(aStart)); if((iNtermCleavables.get(residue) != null)) { // The site is potentially cleavable. // What about restriction residues? if( ((aStart+1) < maxLength) && (iRestrictors.get(Character.valueOf(aParentSequence.charAt(aStart+1))) != null) ) { // Do nothing, since it is not a site. } else { // It is a true site. result += 1; } } else if(aStart == 0) { // It is the N-terminus. Validate it. result += 1; } // Now C-terminal side. if((aEnd+1) < maxLength) { Character residue2 = Character.valueOf(aParentSequence.charAt(aEnd+1)); if(this.iCtermCleavables.get(residue2) != null) { if((aEnd+2) < maxLength) { if(this.iRestrictors.get(Character.valueOf(aParentSequence.charAt(aEnd+2))) == null) result += 2; } else { // The cleavage site appears to be the C-terminal residue in // the parent sequence. This qualifies. result += 2; } } } else if(aEnd == (maxLength-1)) { // The C-terminus of the subsequence is the C-terminus of the parent. // This qualifies. result += 2; } } // That's it. Let's have a look, shall we? switch(result) { case 0: // Neither of the termini checked out. result = Enzyme.ENTIRELY_NOT_ENZYMATIC; break; case 1: // Only N-term checked out. result = Enzyme.N_TERM_ENZYMATIC; break; case 2: // Only C-tyerm checked out. result = Enzyme.C_TERM_ENZYMATIC; break; case 3: // Both N- and C-term checked out. result = Enzyme.FULLY_ENZYMATIC; break; default: throw new RuntimeException("A number larger than 3 has been calculated for the 'enzymaticness' of a peptide."); } return result; } /** * This method checks whether a certain residue is cleavable, based on the sequence it is in, * the position the residue is at and a Map of cleavable residues and restrictors. * * @param aSequence char[] with the sequence. * @param aPosition int with the position of the residue in the char[]. * @param aCleavables HashMap with the cleavables as Characters as keys. * @param aRestrictors HashMap with the restrictors as Characters as keys. * @return boolean 'true' when the residue is considered a cleavage site, * 'false' otherwise. */ private boolean isCleavable(char[] aSequence, int aPosition, HashMap aCleavables, HashMap aRestrictors) { boolean cleavable = false; // Check params. if(aPosition >= aSequence.length || aPosition < 0) { throw new IllegalArgumentException("Your position (" + aPosition + ") was outside of sequence boundaries (0, " + (aSequence.length-1) + ")!"); } // Okay, that checks out. // Now we first see whether this position really can be cleaved. if(aCleavables.containsKey(Character.valueOf(aSequence[aPosition]))) { // Okay, it could possibly be a cleavage site. // See if it has a C-terminal residue, and whether it is a restrictor. if( (aPosition+1 < aSequence.length) && aRestrictors.containsKey(Character.valueOf(aSequence[aPosition+1])) ) { // It is a restrictor! cleavable = false; } else { cleavable = true; } } return cleavable; } /** * Tests the DualEnzyme by digesting a hardcoded protein. * * @param args the arguments */ public static void main(String[] args) { Enzyme dual = new DualEnzyme("TestDualEnzyme", "D", "R", "P", "Cterm", 0); Protein[] p = dual.cleave(new Protein(">sw|Q55645 (15-45)|TEST_HUMAN Test Protein for the cleave() method.", "FGHDKLMDTGKRVWRGHF")); for(int i = 0; i < p.length; i++) { Protein lProtein = p[i]; logger.info(lProtein.getHeader().getFullHeaderWithAddenda()); logger.info(lProtein.getSequence().getSequence()); } } }