/* * Copyright (C) Lennart Martens * * Contact: lennart.martens AT UGent.be (' AT ' to be replaced with '@') */ /* * Created by IntelliJ IDEA. * User: Lennart * Date: 8-okt-02 * Time: 18:38:53 */ package com.compomics.util.protein; import java.util.ArrayList; import org.apache.log4j.Logger; import java.util.HashMap; import java.util.HashSet; import java.util.Vector; /* * CVS information: * * $Revision: 1.5 $ * $Date: 2008/11/18 11:39:11 $ */ /** * This class implements the functionality for an Enzyme. * * @author Lennart Martens * @author Harald Barsnes * @author MArc Vaudel */ public class Enzyme implements Cloneable { // Class specific log4j logger for Enzyme instances. Logger logger = Logger.getLogger(Enzyme.class); public static final int CTERM = 0; // @TODO: should be replaced by Emnum public static final int NTERM = 1; // @TODO: should be replaced by Emnum public static final int FULLY_ENZYMATIC = 1; // @TODO: should be replaced by Emnum public static final int N_TERM_ENZYMATIC = 2; // @TODO: should be replaced by Emnum public static final int C_TERM_ENZYMATIC = 3; // @TODO: should be replaced by Emnum public static final int ENTIRELY_NOT_ENZYMATIC = 4; // @TODO: should be replaced by Emnum /** * This String holds the title (or name) for the enzyme. */ protected String iTitle = null; /** * This char[] holds the residues after which cleavage will occur. */ protected char[] iCleavage = null; /** * Lookup cache for the cleavable residues. */ protected HashMap iCleavables = null; /** * This char[] holds the residues that will restrict cleavage when present * after a cleavable residue. */ protected char[] iRestrict = null; /** * Lookup cache for the restricting residues. */ protected HashMap iRestrictors = null; /** * This integer holds the position marker for the cleavage direction for * this Enzyme. This variable can be matched against the constants defined * on this class. */ protected int iPosition = -1; /** * This variable holds the number of supported missed cleavages. */ protected int iMiscleavages = 0; /** * This constructor requires that you specify all the information for this * enzyme. Title and restrict can be 'null', and the number of miscleavages * is defaulted to 1. * * @param aTitle String with the title (or name) for this enzyme. * @param aCleavage String composed of the residues after which cleavage * will occur. * @param aRestrict String composed of the residues which inhibit cleavage * if present behind of cleavable residues. * @param aPosition String which should correspond to "Cterm" or "Nterm" for * each position respectively. */ public Enzyme(String aTitle, String aCleavage, String aRestrict, String aPosition) { this(aTitle, aCleavage, aRestrict, aPosition, 1); } /** * This constructor allows you to specify all the information for this * enzyme plus the number of missed cleavages that this instance will allow. * Title and restrict can be 'null'. * * @param aTitle String with the title (or name) for this enzyme. * @param aCleavage String composed of the residues after which cleavage * will occur (this String will be uppercased). * @param aRestrict String composed of the residues which inhibit cleavage if * present behind of cleavable residues (this String will be uppercased). * @param aPosition String which should correspond to "Cterm" or "Nterm" for * each position respectively. * @param aMiscleavages integer with the number of supported missed cleavages. */ public Enzyme(String aTitle, String aCleavage, String aRestrict, String aPosition, int aMiscleavages) { iTitle = aTitle; this.setCleavage(aCleavage); this.setRestrict(aRestrict); aPosition = aPosition.trim(); if (aPosition.equalsIgnoreCase("Cterm")) { iPosition = CTERM; } else if (aPosition.equalsIgnoreCase("Nterm")) { iPosition = NTERM; } else { throw new IllegalArgumentException("I only understand the positions 'Nterm' or 'Cterm'! You passed: '" + aPosition + "'."); } iMiscleavages = aMiscleavages; } /** * Creates a new Enzyme from a com.compomics.util.experiment.biology.Enzyme * enzyme and the maximum number of missed cleavages. * * @param enzyme The com.compomics.util.experiment.biology.Enzyme enzyme * @param maxMissedCleavages The maximum number of missed cleavages */ public Enzyme(com.compomics.util.experiment.biology.Enzyme enzyme, int maxMissedCleavages) { String position = "", cleavage = "", restrict = ""; if (enzyme.getAminoAcidBefore().size() > 0) { position = "Cterm"; HashSet<Character> temp = enzyme.getAminoAcidBefore(); for (Character aa : temp) { cleavage += aa; } temp = enzyme.getRestrictionAfter(); for (Character aa : temp) { restrict += aa; } } else { position = "Nterm"; HashSet<Character> temp = enzyme.getAminoAcidAfter(); for (Character aa : temp) { cleavage += aa; } temp = enzyme.getRestrictionAfter(); for (Character aa : temp) { restrict += aa; } } iTitle = enzyme.getName(); this.setCleavage(cleavage); this.setRestrict(restrict); position = position.trim(); if (position.equalsIgnoreCase("Cterm")) { iPosition = CTERM; } else if (position.equalsIgnoreCase("Nterm")) { iPosition = NTERM; } else { throw new IllegalArgumentException("I only understand the positions 'Nterm' or 'Cterm'! You passed: '" + position + "'."); } iMiscleavages = maxMissedCleavages; } /** * Simple getter for the title (name) of the Enzyme. * * @return String with the title (name). */ public String getTitle() { return iTitle; } /** * This method allows the caller to change the title (name) of the Enzyme. * * @param aTitle String with the title (name) for the Enzyme. */ public void setTitle(String aTitle) { iTitle = aTitle; } /** * Simple getter for the cleavagable residues of the Enzyme. * * @return char[] with the cleavable residues. */ public char[] getCleavage() { return iCleavage; } /** * This method allows the caller to specify the cleavable residues. * * @param aCleavage char[] with the cleavable residues (in <b>UPPER * CASE</b>!). */ public void setCleavage(char[] aCleavage) { iCleavage = aCleavage; if (iCleavage != null) { iCleavables = new HashMap(this.iCleavage.length); for (int i = 0; i < iCleavage.length; i++) { iCleavables.put(Character.valueOf(iCleavage[i]), "1"); } } else { iCleavables = new HashMap(); } } /** * This method allows the caller to specify the cleavable residues. They * will be read from the String as a continuous summation of characters * (i.e: 'RKGH'). * * @param aCleavage String with the continuous characters corresponding to * the cleavable residues. Note that the String is uppercased. */ public void setCleavage(String aCleavage) { char[] temp = null; if (aCleavage != null) { temp = aCleavage.toUpperCase().toCharArray(); } this.setCleavage(temp); } /** * Simple getter for the restricting residues of the Enzyme. * * @return char[] with the restricting residues. */ public char[] getRestrict() { return iRestrict; } /** * This method allows the caller to specify the residues that restrict * cleavage. * * @param aRestrict char[] with the residues (in <b>UPPER CASE</b>!) which * restrict cleavage. */ public void setRestrict(char[] aRestrict) { iRestrict = aRestrict; if (iRestrict != null) { iRestrictors = new HashMap(this.iRestrict.length); for (int i = 0; i < iRestrict.length; i++) { iRestrictors.put(Character.valueOf(iRestrict[i]), "1"); } } else { iRestrictors = new HashMap(); } } /** * This method allows the caller to specify the residues which restrict * cleavage. They will be read from the String as a continuous summation of * characters (i.e: 'PGHK'). * * @param aRestrict String with the continuous characters corresponding to * the restricting residues. Note that the String is uppercased. */ public void setRestrict(String aRestrict) { char[] temp = null; if (aRestrict != null) { temp = aRestrict.toUpperCase().toCharArray(); } this.setRestrict(temp); } /** * Simple getter for the cleavage position of the Enzyme. * * @return int with the coded cleavage position (to be compared with the * constants on this class). */ public int getPosition() { return iPosition; } /** * This method allows the caller to set the cleavage position for the * Enzyme. Please use the constants defined on this class as parameters. * * @param aPosition int with the coded position, according to the constants * on this class. */ public void setPosition(int aPosition) { iPosition = aPosition; } /** * Simple getter for the number of allowed missed cleavages for the Enzyme. * * @return int with the number of allowed missed cleavages. */ public int getMiscleavages() { return iMiscleavages; } /** * This method allows the caller to specify the number of allowed missed * cleavages for this enzyme. * * @param aMiscleavages int with the number of allowed missed cleavages. */ public void setMiscleavages(int aMiscleavages) { iMiscleavages = aMiscleavages; } /** * This method generates a String representation of the Enzyme, which is * useful for displaying as useful information for the user or during * testing/debugging. * * @return String with a textual description of this Enzyme. */ public String toString() { return this.toString(""); } /** * This method generates a String representation of the Enzyme, which is * useful for displaying as useful information for the user or during * testing/debugging. It takes a parameter String that is prepended to each * line. * * @param aPrepend String to prepend to each outputted line. * @return String with a textual description of this Enzyme. */ public String toString(String aPrepend) { StringBuffer result = new StringBuffer("\n" + aPrepend + "Hi, I'm the Enzyme '" + this.iTitle + "'.\n"); result.append(aPrepend + "I cleave at the sight of: '" + new String(this.iCleavage) + "'.\n"); if (this.iRestrict != null) { result.append(aPrepend + "My activity is restricted by these residus: '" + new String(this.iRestrict) + "'.\n"); } else { result.append(aPrepend + "There are no residus that restrict my activity.\n"); } result.append(aPrepend + "My position is '" + ((this.iPosition == Enzyme.CTERM) ? "C-terminal" : "N-terminal") + "'.\n"); result.append(aPrepend + "I currently allow " + ((this.iMiscleavages == 0) ? "no" : "up to " + this.iMiscleavages) + " missed cleavage" + ((this.iMiscleavages == 1) ? "" : "s") + ".\n"); return result.toString(); } /** * This method is the focus of the Enzyme instance. It can perform an * <i>in-silico</i> digest of a Protein sequence according to the * specifications detailed in the construction or via the setters. Using * this methods returns all possible peptides, regardless of length. To only * return peptides within certain lengths use the other cleave method. * * @param aProtein Protein instance to cleave. * @return Protein[] with the resultant peptides. */ public Protein[] cleave(Protein aProtein) { return cleave(aProtein, 0, Integer.MAX_VALUE); } /** * This method is the focus of the Enzyme instance. It can perform an * <i>in-silico</i> digest of a Protein sequence according to the * specifications detailed in the construction or via the setters. Only * returns peptides between the minimum and maximum peptide lengths. * * @param aProtein Protein instance to cleave. * @param minPeptideLength The minimum peptide length to consider * @param maxPeptideLength The maximum peptide length to consider * @return Protein[] with the resultant peptides. */ public Protein[] cleave(Protein aProtein, int minPeptideLength, int maxPeptideLength) { // We'll need a lot of stuff here. // - a Vector for all the startindices // - a Vector for the stopindices // - a Vector of intermediate results. Vector startIndices = new Vector(20, 10); Vector endIndices = new Vector(20, 10); Vector interMed = new Vector(20, 10); // We will also feed the current Protein sequence into a // char[] for easy iteration. char[] sequence = aProtein.getSequence().getSequence().toCharArray(); // Check for a header that contains locations. int headerStart = 0; if (aProtein.getHeader() != null) { headerStart = aProtein.getHeader().getStartLocation() - 1; if (headerStart < 0) { headerStart = 0; } } // Okay, I guess we've set the stage now. // Let's start cleaving! int walkingIndex = 0; for (int i = 0; i < sequence.length; i++) { // Transform the current char into the corresponding wrapper. Character current = Character.valueOf(sequence[i]); // See whether it is a cleavable residu! if (iCleavables.get(current) != null) { // Okay, this should be cleavable. // First of all however, we need to check // for the possible presence of a restrictor! // (And, of course, first check to see whether there is a // next character at all!) if ((i + 1) < sequence.length) { Character next = Character.valueOf(sequence[i + 1]); if (iRestrictors.get(next) != null) { // It is a restrictor! // Just let the loop continue! continue; } } // Since we've gotten to here, we need to cleave here! // So do it! // Oh yeah, and mind the position of cleaving! String temp = null; int start = -1; int end = -1; if (this.iPosition == Enzyme.CTERM) { // Take the part, starting from walkingIndex up to the current // as a new peptide and store it in the interMed Vector. temp = new String(sequence, walkingIndex, ((i - walkingIndex) + 1)); // Start index is human-readable (starting from 1), // hence the '+1'. start = headerStart + walkingIndex + 1; end = headerStart + i + 1; // Start the next peptide after the current one. // An index that so happens to walkingIndex = i + 1; } else if (this.iPosition == Enzyme.NTERM) { temp = new String(sequence, walkingIndex, (i - walkingIndex)); // Start index is human readable: starting from 1. start = headerStart + walkingIndex + 1; end = headerStart + i; walkingIndex = i; } // Add each retrieved value to the correct // Vector. interMed.add(temp); startIndices.add(Integer.valueOf(start)); endIndices.add(Integer.valueOf(end)); } } // Add this point, we should check whether we have // the entire sequence. // We probably don't, because the last cleavable residu will // probably not have been the last residu in the sequence. // That's why we should append the 'remainder' of our cleavage // as well (and the corresponding indices as well, of course). if ((walkingIndex < sequence.length) && (!aProtein.isTruncated() || aProtein.getTruncationPosition() == Protein.CTERMTRUNC)) { interMed.add(new String(sequence, walkingIndex, (sequence.length - walkingIndex))); startIndices.add(Integer.valueOf(headerStart + walkingIndex + 1)); endIndices.add(Integer.valueOf(headerStart + sequence.length)); } // Allright, now we should have all the individual peptides. // Now we should take into account the specified number of miscleavages. // Get all the sequences up to now. String[] imSequences = (String[]) interMed.toArray(new String[interMed.size()]); // Cycle the current sequences. for (int j = 0; j < imSequences.length; j++) { String temp = imSequences[j]; // Apply the number of allowed missed cleavages sequentially from // this sequence. for (int k = 0; k < this.iMiscleavages; k++) { // If we fall outside of the range of current sequences // (for instance if we try to apply a second allowed missed // cleavage to the penultimate peptide, we fall outside of // the available peptides!) // we break the loop. if ((j + k + 1) >= imSequences.length) { break; } // Add our constructed sequence. temp += imSequences[j + k + 1]; interMed.add(temp); startIndices.add(startIndices.get(j)); endIndices.add(endIndices.get(j + k + 1)); } } // Cycle all to check for // Cycle all again, and do a cleanup if C-terminal truncation has been detected. if ((aProtein.isTruncated()) && (aProtein.getTruncationPosition() == Protein.CTERMTRUNC)) { // Okay, C-terminal truncation is flagged. // This means that all peptides with a startindex equal to the startindex // of the parent (or '1' if the parent does not have startindex), are not // realistic peptides, but artifacts of our truncation. // So they get kicked out. int parentStart = aProtein.getHeader().getStartLocation(); if (parentStart < 0) { parentStart = 1; } for (int i = 0; i < interMed.size(); i++) { int start = ((Integer) startIndices.get(i)).intValue(); if (start == parentStart) { startIndices.remove(i); endIndices.remove(i); interMed.remove(i); i--; } } } // We've got all sequences. // Let's construct the Protein instances for them and // then return them! int liSize = interMed.size(); Vector result = new Vector(liSize); Header header = aProtein.getHeader(); // Create the Proteins and store them. for (int i = 0; i < liSize; i++) { // If the sequence comes from a translation, it will contain an '_' if a stopcodon is present. // Omit all sequences containing these. String pepSequence = (String) interMed.get(i); if (pepSequence.indexOf("_") < 0) { // only include peptides within the min and max peptide lengths if (pepSequence.length() >= minPeptideLength && pepSequence.length() <= maxPeptideLength) { Header h = null; if (header != null) { h = (Header) header.clone(); h.setLocation(((Integer) startIndices.get(i)).intValue(), ((Integer) endIndices.get(i)).intValue()); } result.add(new Protein(h, new AASequenceImpl(pepSequence))); } } } Protein[] finalResult = new Protein[result.size()]; result.toArray(finalResult); return finalResult; } /** * This method returns a deep copy of the current Enzyme. * * @return Object Enzyme instance that is a deep copy of the current Enzyme. */ public Object clone() { Enzyme e = null; try { e = (Enzyme) super.clone(); e.iCleavables = this.iCleavables; e.iCleavage = this.iCleavage; e.iMiscleavages = this.iMiscleavages; e.iPosition = this.iPosition; e.iRestrict = this.iRestrict; e.iRestrictors = this.iRestrictors; e.iTitle = this.iTitle; } catch (CloneNotSupportedException cnse) { logger.error(cnse.getMessage(), cnse); } return e; } /** * This method reports on the possibility that the presented subsequence is * the result of enzymatic activity. Note that using a substring, only the * FIRST (starting from the N-terminus) occurrence of the subsequence in the * parent String will be considered! If multiple occurrences are possible, * use the overloaded method that takes indices. Returning int values can be * checked against public static final vars on this class. * * @param aParentSequence String with the parent sequence * @param aSubSequence String with the subsequence * @return int with the coded possibility (1 = Full enzymatic product, 2 = * N-terminal half enzymatic product, 3 = C-terminal half enzymatic product * and 4 = Entirely not an enzymatic product. */ public int isEnzymaticProduct(String aParentSequence, String aSubSequence) { int start = aParentSequence.indexOf(aSubSequence); int end = start + aSubSequence.length(); return this.isEnzymaticProduct(aParentSequence, start + 1, end); } /** * This method reports on the possibility that the presented subsequence * (represented by the start and end location in the parent) is the result * of enzymatic activity. Returning int values can be checked against public * static final vars on this class. * * @param aParentSequence String with the parent sequence * @param aStart int with the start of the subsequence relative to the * parent (first residue is '1'). * @param aEnd int with the end of the subsequence relative to the parent * @return int with the coded possibility (1 = Full enzymatic product, 2 = * N-terminal half enzymatic product, 3 = C-terminal half enzymatic product * and 4 = Entirely not an enzymatic product. */ public int isEnzymaticProduct(String aParentSequence, int aStart, int aEnd) { int result = 0; // Correction for human-readable indices. aStart--; aEnd--; // Check validity of parameters. if ((aStart < 0) || (aEnd < 0)) { throw new IllegalArgumentException("Subsequence is not a subsequence of the parent!"); } if (aEnd > aParentSequence.length() - 1) { throw new IllegalArgumentException("Subsequence end index out of parent length range (" + aEnd + ">" + (aParentSequence.length() - 1) + ")!"); } if (aStart > aEnd) { throw new IllegalArgumentException("Subsequence could not be retreived since start index is greater than end index (" + aStart + ">=" + aEnd + ")!"); } // The maximum length of the sequence. int maxLength = aParentSequence.length(); if (this.getPosition() == Enzyme.CTERM) { // First check N-terminal side. if ((aStart - 1) >= 0) { Character residue = Character.valueOf(aParentSequence.charAt(aStart - 1)); Character possRestrict = Character.valueOf(aParentSequence.charAt(aStart)); if ((this.iCleavables.get(residue) != null) && (this.iRestrictors.get(possRestrict) == null)) { result += 1; } } else { // It is the N-terminus of the parent. This checks out. result += 1; } // Now C-terminal side. Character residue = Character.valueOf(aParentSequence.charAt(aEnd)); if ((this.iCleavables.get(residue) != null) || ((aEnd + 1) == maxLength)) { if ((aEnd + 1) < maxLength) { if (this.iRestrictors.get(Character.valueOf(aParentSequence.charAt(aEnd + 1))) == null) { result += 2; } } else { // The cleavage site appears to be the C-terminal residue in // the parent sequence. This qualifies. result += 2; } } } else { // First check N-terminal side. Character residue = Character.valueOf(aParentSequence.charAt(aStart)); if ((iCleavables.get(residue) != null)) { // The site is potentially cleavable. // What about restriction residues? if (((aStart + 1) < maxLength) && (iRestrictors.get(Character.valueOf(aParentSequence.charAt(aStart + 1))) != null)) { // Do nothing, since it is not a site. } else { // It is a true site. result += 1; } } else if (aStart == 0) { // It is the N-terminus. Validate it. result += 1; } // Now C-terminal side. if ((aEnd + 1) < maxLength) { Character residue2 = Character.valueOf(aParentSequence.charAt(aEnd + 1)); if (this.iCleavables.get(residue2) != null) { if ((aEnd + 2) < maxLength) { if (this.iRestrictors.get(Character.valueOf(aParentSequence.charAt(aEnd + 2))) == null) { result += 2; } } else { // The cleavage site appears to be the C-terminal residue in // the parent sequence. This qualifies. result += 2; } } } else if (aEnd == (maxLength - 1)) { // The C-terminus of the subsequence is the C-terminus of the parent. // This qualifies. result += 2; } } // That's it. Let's have a look, shall we? switch (result) { case 0: // Neither of the termini checked out. result = Enzyme.ENTIRELY_NOT_ENZYMATIC; break; case 1: // Only N-term checked out. result = Enzyme.N_TERM_ENZYMATIC; break; case 2: // Only C-tyerm checked out. result = Enzyme.C_TERM_ENZYMATIC; break; case 3: // Both N- and C-term checked out. result = Enzyme.FULLY_ENZYMATIC; break; default: throw new RuntimeException("A number larger than 3 has been calculated for the 'enzymaticness' of a peptide."); } return result; } }