package com.compomics.util.protein;
import org.apache.log4j.Logger;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.regex.PatternSyntaxException;
import java.util.*;
/**
* This class implements the functionality of an Enzyme by
* simulating digestion based on a regular expression.
*
* @author Florian Reisinger
* @since 2.9
*/
public class RegExEnzyme extends Enzyme {
// Class specific log4j logger for RegExEnzyme instances.
Logger logger = Logger.getLogger(RegExEnzyme.class);
private Pattern iCleavagePattern = null;
/**
* Create a new RegExEnzyme.
*
* @param aTitle the title
* @param aCleavage the cleavage site
* @param aRestrict the restricting amino acids
* @param aPosition the position
*/
public RegExEnzyme(String aTitle, String aCleavage, String aRestrict, String aPosition) {
this(aTitle, aCleavage, aRestrict, aPosition, 1);
}
/**
* Create a new RegExEnzyme.
*
* @param aTitle the title
* @param aCleavage the cleavage site
* @param aRestrict the restricting amino acids
* @param aPosition the position
* @param aMiscleavages max number of miscleavages
*/
public RegExEnzyme(String aTitle, String aCleavage, String aRestrict, String aPosition, int aMiscleavages) {
// since the cleavage/restriction pattern for this class are expected to be a
// regular expression, we don't want to use the default implementation
super(aTitle, "", aRestrict, aPosition, aMiscleavages);
// now interpret the cleavage/restriction strings as regular expressions
this.setCleavage(aCleavage);
this.setRestrict(aRestrict);
}
/**
* This method can be used to set the cleavage pattern for this RegExEnzyme.
* The pattern is used by the enzyme to find the cleavage sits, where the last
* residue matched by the pattern defines the cleavage site.
* (e.g. in case of C-terminal cleavage, the enzyme cuts right after the match
* of the pattern, and in case of N-terminal cleavage, the enzyme cuts just
* before the last residue matching the pattern)
*
* @param aCleavage a String representing the pattern for the cleavage site.
*
* @throws PatternSyntaxException if the provided string could not be compiled as regular expression.
*/
public void setCleavage(String aCleavage) {
// compile a cleavage Pattern from the specified String
iCleavagePattern = Pattern.compile(aCleavage, Pattern.CASE_INSENSITIVE);
}
public void setCleavage(char[] aCleavage) {
this.setCleavage( new String(aCleavage) );
}
public char[] getCleavage() {
return iCleavagePattern.pattern().toCharArray();
}
public String toString() {
return this.toString("");
}
public String toString(String aPrepend) {
StringBuffer result = new StringBuffer("\n" + aPrepend + "Hi, I'm the RegExEnzyme '" + this.getTitle() + "'.\n");
result.append(aPrepend).append("I cleave at the match to the regular expression: ");
result.append( iCleavagePattern.pattern()).append("\n");
if(this.getRestrict() != null && this.getRestrict().length > 0) {
result.append(aPrepend).append("My activity is restricted by these residus: '");
result.append(new String(this.getRestrict())).append("'.\n");
} else {
result.append(aPrepend).append("There are no residus that restrict my activity.\n");
}
result.append(aPrepend).append("My position is '");
result.append((this.getPosition() == Enzyme.CTERM) ? "C-terminal" : "N-terminal").append("'.\n");
result.append(aPrepend).append("I currently allow ");
result.append((this.getMiscleavages() == 0) ? "no" : "up to " + this.getMiscleavages());
result.append(" missed cleavage").append((this.getMiscleavages() == 1) ? "" : "s").append(".\n");
return result.toString();
}
public Object clone() {
RegExEnzyme ree = (RegExEnzyme)super.clone();
if(ree != null) {
// not be necessary to clone the pattern, because Pattern are immutable
ree.iCleavagePattern = this.iCleavagePattern;
}
return ree;
}
public int isEnzymaticProduct(String aParentSequence, String aSubSequence) {
int start = aParentSequence.indexOf(aSubSequence);
if ( start < 0 ) {
throw new IllegalArgumentException("Subsequence is not a subsequence of the parent!");
}
// now we know that the specified sequence is indeed a subsequence of the parent
// we have to find out whether it is a enzymatic product, e.g. if the sequence
// before/after is cleavage site without restriction
int end = start + aSubSequence.length();
// call isEnzymaticProduct method with corrected start index (first position = 1 instead of 0)
return this.isEnzymaticProduct(aParentSequence, start+1, end);
}
public int isEnzymaticProduct(String aParentSequence, int aStart, int aEnd) {
int result; // place holder that is going to hold the result value
// Check validity of parameters.
if( (aStart-1 < 0) || (aEnd-1 < 0) ) {
throw new IllegalArgumentException("Subsequence is not a subsequence of the parent!");
}
if(aEnd-1 > aParentSequence.length()-1) {
throw new IllegalArgumentException("Subsequence end index out of parent length range (" + aEnd + ">" + (aParentSequence.length()-1) + ")!");
}
if(aStart > aEnd) {
throw new IllegalArgumentException("Subsequence could not be retreived since start index is greater than end index (" + aStart + ">=" + aEnd + ")!");
}
// create a matcher that will find cleavage sites
Matcher matcher = iCleavagePattern.matcher(aParentSequence);
// get all cleavage pattern endpoints (positions where the enzyme cleaves)
// distinguish between C-terminal (pattern end) and N-terminal (pattern end - 1) cleavage
// also take into account the restriction sites (when we do NOT cleave although it is a cleavage site)
int end = 0;
List ends = new ArrayList();
// find the pattern matching sites
while (matcher.find() && end <= aEnd) {
int tmp = matcher.end(); // end of pattern match (site of cleaving)
// check we are not at the end of the sequence and that we are not at a restricted cleavage site
if ( tmp < aParentSequence.length() && iRestrictors.containsKey(Character.valueOf(aParentSequence.charAt(tmp)) ) ) {
// resticted, do not cleave here
} else { // remember the cleavage position
// we are at a valid cleavage site, now we have to distinguish between N-terminal and C-terminal cleavage
if (iPosition == Enzyme.CTERM) {
end = tmp;
} else if (iPosition == Enzyme.NTERM) {
end = tmp - 1;
} else {
throw new IllegalStateException("Cleavage position is not specified correctly! Can not determine if the proviced peptide is a valid enzymatic product!");
}
ends.add(Integer.valueOf(end));
}
}
// check N-terminal side of peptide: the start of the peptide -1 = the position of a cleavage site
// check C-terminal side of peptide: the end of the peptide = the position of a cleavage site
// if both are true we have a fully enzymatic peptide
if ( ends.contains(Integer.valueOf(aStart-1)) || (aStart == 1) ) {
// if the start -1 of the peptide is the end of a cleavage site or it is the start of the entire sequence,
// then we have already a N-terminal peptide. If it also ends at a cleavage site, then it is fully enzymatic
if ( ends.contains(Integer.valueOf(aEnd)) ) {
result = Enzyme.FULLY_ENZYMATIC;
} else {
result = Enzyme.N_TERM_ENZYMATIC;
}
} else if ( ends.contains(Integer.valueOf(aEnd)) || (aEnd == aParentSequence.length()) ) {
// if the end of the peptide is the end of a cleavage site or the peptide ends at the end of the entire
// sequence and since we know it did not start at a cleavage site, it can only be a C-terminal peptide
result = Enzyme.C_TERM_ENZYMATIC;
} else {
result = Enzyme.ENTIRELY_NOT_ENZYMATIC;
}
return result;
}
public Protein[] cleave(Protein aProtein) {
// // // // // // // // // // // // // // // // // // // //
// digest the protein sequence into peptides and record the start/end positions
List peptides = new ArrayList(); // the list that will hold the peptides we generate (Strings)
List startIndices = new ArrayList(); // keep track of the start positions of the peptides (Integer)
List endIndices = new ArrayList(); // keep track of the stop positions of the peptides (Integer)
// !Attention! the order of the next methods is important and shoud not be changed/interrupted
// without checking the correctness of the cleaving algorythm!
cleave(aProtein, peptides, startIndices, endIndices);
handleTruncations(aProtein, peptides, startIndices, endIndices);
addMisCleaved(peptides, startIndices, endIndices);
removeStopCodonPeptides(peptides, startIndices, endIndices);
// // // // // // // // // // // // // // // // // // // //
// now we have the peptides (taking miscleavages into account), lets create protein objects from them
Header header = aProtein.getHeader();
Protein[] result = new Protein[peptides.size()];
Iterator iterator = peptides.iterator();
int i = 0;
while (iterator.hasNext()) {
String peptide = (String) iterator.next();
Header h = (Header)header.clone();
h.setLocation(((Integer)startIndices.get(i)).intValue()+1, ((Integer)endIndices.get(i)).intValue());
result[i] = new Protein( h, new AASequenceImpl(peptide) );
i++;
}
return result;
}
/**
* This method cleans up peptides containing a stop codon "_". The corresponding
* lists of start and end indices are updated accordingly.
*
* @param peptides the list of peptides to scan for "_" containing peptides.
* @param startIndices the corresponfing list of start indices.
* @param endIndices the corresponfing list of end indices.
*/
private void removeStopCodonPeptides(List peptides, List startIndices, List endIndices) {
Iterator pepIter = peptides.iterator();
Iterator startIter = startIndices.iterator();
Iterator endIter = endIndices.iterator();
while (pepIter.hasNext()) {
String peptide = (String) pepIter.next();
// keep iterators in sync
startIter.next();
endIter.next();
// check for stop codon
if (peptide.indexOf("_") >= 0) {
// peptide contains a stop codon and has to be removed
pepIter.remove();
// also remove the corresponding start and end positions
startIter.remove();
endIter.remove();
}
}
}
/**
* This method performs a simple digest following the pattern of the enzyme.
*
* @param aProtein the protein to cleave.
* @param peptides the list that is to hold the peptides.
* @param startIndices the list that is to hold the corresponding start indices for the peptides.
* @param endIndices the list that is to hold the corresponding start indices for the peptides.
*/
private void cleave(Protein aProtein, List peptides, List startIndices, List endIndices) {
String sequence = aProtein.getSequence().getSequence();
// Check for a header that contains locations and if we have to adjust the positions we calculate while cleaving
int headerStart = aProtein.getHeader().getStartLocation()-1;
if(headerStart < 0) {
headerStart = 0;
}
int startPos = 0; // the start position of the current peptide
int currEnd; // the end position of the current peptide
char after; // the amino acid after the cleavage site (where we have to check for restrictions)
Matcher matcher = iCleavagePattern.matcher(sequence);
while (matcher.find()) { // get the next match
currEnd = matcher.end();
// check if the cleavage site is at the end of the sequence string
if (currEnd >= sequence.length()) {
// the cleaveage pattern is right to the end of the sequence
// we don't need to cleave, since there is nothing left after the cleavage site
} else {
// we are not at the end of the sequence, so we record the amino acid after the cleavage site
after = sequence.charAt(currEnd); // character AFTER cleavage site
// if the first character after the cleavage site is a restrictor, we do not cleave!
// only if the next amino acid is not a restricting site, we cleave
if ( iRestrict == null || (Arrays.binarySearch(iRestrict, after) < 0) ) {
// not restricted! we cleave
// but first we have to check at which position Cterm or Nterm
if ( iPosition == Enzyme.CTERM ) {
String pep = sequence.substring(startPos, currEnd);
peptides.add( pep );
startIndices.add(Integer.valueOf(headerStart + startPos) );
endIndices.add( Integer.valueOf(headerStart + startPos + pep.length()) );
startPos = currEnd; // update the start position of the peptide only if we have cleaved
} else if ( iPosition == Enzyme.NTERM ) {
String pep = sequence.substring(startPos, currEnd-1);
peptides.add( pep );
startIndices.add(Integer.valueOf(headerStart + startPos) );
endIndices.add(Integer.valueOf(headerStart + startPos + pep.length()) );
startPos = currEnd-1; // update the start position of the peptide only if we have cleaved
} else {
// should be covered by the constructor (does not allow differnt values form CTERM/NTERM
throw new IllegalStateException("Illegal position (neiter c terminal nor n terminal) while trying to cleave!");
}
} else {
// restricted, we do NOT cleave
}
}
}
// now add the rest of the sequence (everything after the last match)
String lastPep = sequence.substring(startPos, sequence.length());
peptides.add(lastPep);
startIndices.add( Integer.valueOf(headerStart + startPos) );
endIndices.add( Integer.valueOf((headerStart + startPos + lastPep.length())) );
}
/**
* This method checks if the isTruncated flag of the protein is set and if so, it will
* assume that the first (in case of C-terminal truncation) or last (in case of N-terminal
* truncation) peptide are artifacts of that truncation and not real peptides. Accordingly
* the first or last peptide will be removed from the peptide list and start and end positions
* will be updated.
*
* @param aProtein the protein to check (which may has been truncated).
* @param peptides the list of peptides derived from that protein through normal cleavage.
* @param startIndices the list of corresponding start indices for the peptides.
* @param endIndices the list of corresponding end indices for the peptides.
*/
private void handleTruncations(Protein aProtein, List peptides, List startIndices, List endIndices) {
// check weather the protein was truncated and remove the according peptide
// (C-term truncation -> first peptide, N-term truncation -> last peptide)
if (aProtein.isTruncated()) {
if (aProtein.getTruncationPosition() == Protein.CTERMTRUNC) {
// C-terminal truncated protein, we get rid of the FIRST peptide, since
// it is most likely an artefact of the truncation and not a real peptide
peptides.remove(0);
startIndices.remove(0);
endIndices.remove(0);
} else if (aProtein.getTruncationPosition() == Protein.NTERMTRUNC ) {
// N-terminal truncated protein, we get rid of the LAST peptide, since
// it is most likely an artefact of the truncation and not a real peptide
int last = peptides.size() - 1;
peptides.remove(last);
startIndices.remove(last);
endIndices.remove(last);
} else {
throw new IllegalArgumentException("Truncation position is expected to be " +
"either 'Protein.CTERMTRUNC' or 'Protein.NTERMTRUNC'! Protein: " +
aProtein.getHeader().getFullHeaderWithAddenda());
}
}
}
/**
* Method that computes the peptides if miscleavages were allowed.
*
* @param peptides the list of peptides without miscleavages.
* @param startIndices the list of corresponding start indices for the peptides.
* @param endIndices the list of corresponding end indices for the peptides.
*/
private void addMisCleaved(List peptides, List startIndices, List endIndices) {
// // // // // // // // // // // // // // // // // // // //
// Allright, now we should have all the individual peptides.
// Now we should take into account the specified number of miscleavages.
// Get all the sequences up to now.
String[] imSequences = (String[]) peptides.toArray(new String[peptides.size()]);
// Cycle the current sequences.
for(int j=0;j<imSequences.length;j++) {
String temp = imSequences[j];
// Apply the number of allowed missed cleavages sequentially from
// this sequence.
for (int k=0;k<this.iMiscleavages;k++) {
// If we fall outside of the range of current sequences
// (for instance if we try to apply a second allowed missed
// cleavage to the penultimate peptide, we fall outside of
// the available peptides!)
// we break the loop.
if((j+k+1) >= imSequences.length) {
break;
}
// Add our constructed sequence.
temp += imSequences[j+k+1];
peptides.add(temp);
startIndices.add(startIndices.get(j));
endIndices.add(endIndices.get(j+k+1));
}
}
}
}