package com.compomics.util.experiment.biology;
import com.compomics.util.experiment.personalization.ExperimentObject;
import com.compomics.util.preferences.DigestionPreferences;
import com.compomics.util.preferences.SequenceMatchingPreferences;
import com.compomics.util.protein.Header.DatabaseType;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
/**
* This class models a protein.
*
* @author Marc Vaudel
* @author Harald Barsnes
*/
public class Protein extends ExperimentObject {

    /**
     * The version UID for Serialization/Deserialization compatibility.
     */
    static final long serialVersionUID = 1987224639519365761L;
    /**
     * The protein accession.
     */
    private String accession;
    /**
     * Boolean indicating if the protein is not existing (decoy protein for
     * instance).
     */
    private boolean decoy;
    /**
     * The protein sequence.
     */
    private String sequence;
    /**
     * The protein database type.
     */
    private DatabaseType databaseType;

    /**
     * Constructor for a protein. Accession, sequence and database type are
     * left unset (null) and the protein is not flagged as decoy.
     */
    public Protein() {
    }

    /**
     * Simplistic constructor for a protein (typically used when loading
     * identification files). The sequence and database type are left unset.
     *
     * @param accession The protein accession
     * @param isDecoy boolean indicating whether the protein is a decoy
     */
    public Protein(String accession, boolean isDecoy) {
        this.accession = accession;
        this.decoy = isDecoy;
    }

    /**
     * Constructor for a protein. The database type is left unset.
     *
     * @param accession The protein accession
     * @param sequence The protein sequence
     * @param isDecoy boolean indicating whether the protein is a decoy
     */
    public Protein(String accession, String sequence, boolean isDecoy) {
        this.accession = accession;
        this.sequence = sequence;
        this.decoy = isDecoy;
    }

    /**
     * Constructor for a protein.
     *
     * @param accession The protein accession
     * @param databaseType The protein database the protein comes from
     * @param sequence The protein sequence
     * @param isDecoy boolean indicating whether the protein is a decoy
     */
    public Protein(String accession, DatabaseType databaseType, String sequence, boolean isDecoy) {
        this.accession = accession;
        this.databaseType = databaseType;
        this.sequence = sequence;
        this.decoy = isDecoy;
    }

    /**
     * Indicates if the protein is artificial (from a decoy database for
     * instance).
     *
     * @return a boolean indicating if the protein is a decoy
     */
    public boolean isDecoy() {
        return decoy;
    }

    /**
     * Getter for the protein accession.
     *
     * @return the protein accession
     */
    public String getAccession() {
        return accession;
    }

    /**
     * Getter for the protein database type.
     *
     * @return the protein database type
     */
    public DatabaseType getDatabaseType() {
        return databaseType;
    }

    /**
     * Getter for the protein sequence.
     *
     * @return the protein sequence
     */
    public String getSequence() {
        return sequence;
    }

    /**
     * A method to compare proteins. For now accession based. The comparison
     * is null-safe: a null argument is never the same protein, and two
     * proteins without accession are considered the same.
     *
     * @param anotherProtein another protein
     * @return a boolean indicating if the proteins are identical
     */
    public boolean isSameAs(Protein anotherProtein) {
        if (anotherProtein == null) {
            return false;
        }
        String otherAccession = anotherProtein.getAccession();
        if (accession == null) {
            return otherAccession == null;
        }
        return accession.equals(otherAccession);
    }

    /**
     * Returns the key for protein indexing. For now the protein accession.
     *
     * @return the key for protein indexing.
     */
    public String getProteinKey() {
        return accession;
    }

    /**
     * Returns the number of amino acids in the sequence. The sequence must
     * have been set.
     *
     * @return the number of amino acids in the sequence
     */
    public int getLength() {
        return sequence.length();
    }

    /**
     * Returns the observable amino acids in the sequence when using the given
     * enzymes with the given maximal peptide length. The protein is cut at
     * every position where at least one enzyme cleaves, and every residue of
     * a resulting peptide is flagged with 1 when the peptide is not longer
     * than the maximal length, 0 otherwise.
     *
     * @param enzymes the enzymes to use
     * @param pepMaxLength the max peptide length
     *
     * @return an array of the size of the sequence containing 1 at the index
     * of every observable amino acid and 0 elsewhere
     */
    public int[] getObservableAminoAcids(ArrayList<Enzyme> enzymes, double pepMaxLength) {
        int sequenceLength = sequence.length();
        int[] observableAas = new int[sequenceLength];
        int peptideStart = 0;
        for (int i = 0; i < sequenceLength - 1; i++) {
            if (isCleavedByAny(enzymes, sequence.charAt(i), sequence.charAt(i + 1))) {
                // The cleavage occurs after residue i, which therefore belongs
                // to the peptide ending here and not to the next one.
                markIfObservable(observableAas, peptideStart, i + 1, pepMaxLength);
                peptideStart = i + 1;
            }
        }
        // last peptide, running up to the end of the protein
        markIfObservable(observableAas, peptideStart, sequenceLength, pepMaxLength);
        return observableAas;
    }

    /**
     * Flags the residues in [from, to) as observable when the peptide they
     * form is not longer than the given maximal length.
     *
     * @param observableAas the array to fill
     * @param from the index of the first residue of the peptide, inclusive
     * @param to the index of the last residue of the peptide, exclusive
     * @param pepMaxLength the max peptide length
     */
    private static void markIfObservable(int[] observableAas, int from, int to, double pepMaxLength) {
        if (to - from <= pepMaxLength) {
            for (int k = from; k < to; k++) {
                observableAas[k] = 1;
            }
        }
        // entries of too long peptides keep the array default value of 0
    }

    /**
     * Indicates whether at least one of the given enzymes cleaves between the
     * two given amino acids.
     *
     * @param enzymes the enzymes to use
     * @param aaBefore the amino acid before the candidate cleavage site
     * @param aaAfter the amino acid after the candidate cleavage site
     *
     * @return true if at least one enzyme cleaves at this position
     */
    private static boolean isCleavedByAny(ArrayList<Enzyme> enzymes, char aaBefore, char aaAfter) {
        for (Enzyme enzyme : enzymes) {
            if (enzyme.isCleavageSite(aaBefore, aaAfter)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the number of observable amino acids in the sequence.
     *
     * @param enzymes the enzymes to use
     * @param pepMaxLength the max peptide length
     *
     * @return the number of observable amino acids of the sequence
     */
    public int getObservableLength(ArrayList<Enzyme> enzymes, double pepMaxLength) {
        int observableLength = 0;
        for (int observable : getObservableAminoAcids(enzymes, pepMaxLength)) {
            observableLength += observable;
        }
        return observableLength;
    }

    /**
     * Returns the number of cleavage sites, i.e., the number of positions in
     * the sequence where at least one of the enzymes cleaves.
     *
     * @param enzymes the enzymes to use
     *
     * @return the number of cleavage sites
     */
    public int getNCleavageSites(ArrayList<Enzyme> enzymes) {
        int nCleavageSites = 0;
        for (int i = 0; i < sequence.length() - 1; i++) {
            if (isCleavedByAny(enzymes, sequence.charAt(i), sequence.charAt(i + 1))) {
                nCleavageSites++;
            }
        }
        return nCleavageSites;
    }

    /**
     * Returns the protein's molecular weight. (Note that when using a
     * SequenceFactory it is recommended to use the SequenceFactory's
     * computeMolecularWeight method instead, as that method stored the computed
     * molecular weights instead of recalculating them every time.)
     *
     * The weight is the sum of the monoisotopic masses of all residues plus
     * two hydrogens and one oxygen. '*' characters are skipped.
     *
     * @return the protein's molecular weight in Da
     *
     * @throws IllegalArgumentException if the sequence contains a character
     * which cannot be mapped to an amino acid
     */
    public double computeMolecularWeight() {
        double mass = Atom.H.getMonoisotopicMass();
        for (int iaa = 0; iaa < sequence.length(); iaa++) {
            char aa = sequence.charAt(iaa);
            if (aa == '*') {
                continue;
            }
            AminoAcid currentAA;
            try {
                currentAA = AminoAcid.getAminoAcid(aa);
                if (currentAA != null) {
                    mass += currentAA.getMonoisotopicMass();
                }
            } catch (IllegalArgumentException e) {
                // a '>' in the sequence hints at a header parsed as sequence
                if (aa == '>') {
                    throw new IllegalArgumentException("Error parsing the sequence of " + accession + ". Protein sequence: " + sequence + ".", e);
                } else {
                    throw new IllegalArgumentException("Unknown amino acid: " + aa, e);
                }
            }
            if (currentAA == null) {
                // explicit check instead of relying on a NullPointerException
                if (aa == '>') {
                    throw new IllegalArgumentException("Error parsing the sequence of " + accession);
                } else {
                    throw new IllegalArgumentException("Unknown amino acid: " + aa);
                }
            }
        }
        mass += Atom.H.getMonoisotopicMass() + Atom.O.getMonoisotopicMass();
        return mass;
    }

    /**
     * Returns the list of indexes where a peptide can be found in the protein
     * sequence. 1 is the first amino acid.
     *
     * @param peptideSequence the sequence of the peptide of interest
     * @param sequenceMatchingPreferences the sequence matching preferences
     *
     * @return the list of indexes where a peptide can be found in a protein
     * sequence
     */
    public ArrayList<Integer> getPeptideStart(String peptideSequence, SequenceMatchingPreferences sequenceMatchingPreferences) {
        AminoAcidPattern aminoAcidPattern = AminoAcidPattern.getAminoAcidPatternFromString(peptideSequence);
        return aminoAcidPattern.getIndexes(sequence, sequenceMatchingPreferences);
    }

    /**
     * Returns a boolean indicating whether the protein starts with the given
     * peptide. When the protein starts with an 'M', the peptide is also
     * tested against the residues directly following this first 'M'. A
     * peptide longer than the protein is never N-terminal.
     *
     * @param peptideSequence the peptide sequence
     * @param sequenceMatchingPreferences the sequence matching preferences
     *
     * @return a boolean indicating whether the protein starts with the given
     * peptide
     */
    public boolean isNTerm(String peptideSequence, SequenceMatchingPreferences sequenceMatchingPreferences) {
        int peptideLength = peptideSequence.length();
        if (peptideLength > sequence.length()) {
            // substring would be out of bounds, and no match is possible
            return false;
        }
        AminoAcidSequence aminoAcidPattern = new AminoAcidSequence(peptideSequence);
        if (aminoAcidPattern.matchesIn(sequence.substring(0, peptideLength), sequenceMatchingPreferences)) {
            return true;
        }
        // The length test comes first so that charAt(0) is never evaluated on
        // an empty sequence.
        if (sequence.length() > peptideLength && sequence.charAt(0) == 'M') {
            return aminoAcidPattern.matchesIn(sequence.substring(1, peptideLength + 1), sequenceMatchingPreferences);
        }
        return false;
    }

    /**
     * Returns a boolean indicating whether the protein ends with the given
     * peptide. If the peptide is at least as long as the protein, the entire
     * protein sequence is tested.
     *
     * @param peptideSequence the peptide sequence
     * @param sequenceMatchingPreferences the sequence matching preferences
     *
     * @return a boolean indicating whether the protein ends with the given
     * peptide
     */
    public boolean isCTerm(String peptideSequence, SequenceMatchingPreferences sequenceMatchingPreferences) {
        int peptideLength = peptideSequence.length();
        String subSequence;
        if (sequence.length() <= peptideLength) {
            subSequence = sequence;
        } else {
            // Exactly the last peptideLength residues: one more residue would
            // also accept a peptide ending one position before the C-terminus.
            subSequence = sequence.substring(sequence.length() - peptideLength);
        }
        AminoAcidSequence aminoAcidPattern = new AminoAcidSequence(peptideSequence);
        return aminoAcidPattern.matchesIn(subSequence, sequenceMatchingPreferences);
    }

    /**
     * Returns true if the peptide is enzymatic, i.e., both termini can be
     * generated by the enzyme used. If a peptide maps to multiple locations in
     * the protein sequence this method returns true if one or more of these
     * peptides are enzymatic, even if not all mappings are enzymatic. A
     * terminus located at the end of the protein (no flanking amino acid)
     * is accepted when the other terminus is a cleavage site.
     *
     * @param peptideSequence the peptide sequence to check, must not be empty
     * @param enzyme the enzyme to use
     * @param sequenceMatchingPreferences the sequence matching preferences
     *
     * @return true if the peptide is enzymatic
     *
     * @throws IOException if an IOException occurs
     */
    public boolean isEnzymaticPeptide(String peptideSequence, Enzyme enzyme, SequenceMatchingPreferences sequenceMatchingPreferences) throws IOException {
        // get the amino acid directly before and after every peptide mapping
        HashMap<Integer, String[]> surroundingAminoAcids = getSurroundingAA(peptideSequence, 1, sequenceMatchingPreferences);
        String firstAA = peptideSequence.charAt(0) + "";
        String lastAA = peptideSequence.charAt(peptideSequence.length() - 1) + "";
        // iterate the possible mappings, one enzymatic mapping is enough
        for (String[] flankingAminoAcids : surroundingAminoAcids.values()) {
            String before = flankingAminoAcids[0];
            String after = flankingAminoAcids[1];
            if ((enzyme.isCleavageSite(before, firstAA) && enzyme.isCleavageSite(lastAA, after))
                    || (before.length() == 0 && enzyme.isCleavageSite(lastAA, after))
                    || (enzyme.isCleavageSite(before, firstAA) && after.length() == 0)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns true if the peptide is enzymatic, i.e., both termini can be
     * generated by one of the enzymes used. If a peptide maps to multiple
     * locations in the protein sequence this method returns true if one or more
     * of these peptides are enzymatic, even if not all mappings are enzymatic.
     *
     * @param peptideSequence the peptide sequence to check
     * @param enzymes the enzymes to use
     * @param sequenceMatchingPreferences the sequence matching preferences
     *
     * @return true if the peptide is enzymatic for at least one of the enzymes
     *
     * @throws IOException if an IOException occurs
     */
    public boolean isEnzymaticPeptide(String peptideSequence, ArrayList<Enzyme> enzymes, SequenceMatchingPreferences sequenceMatchingPreferences) throws IOException {
        for (Enzyme enzyme : enzymes) {
            if (isEnzymaticPeptide(peptideSequence, enzyme, sequenceMatchingPreferences)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the amino acids surrounding a peptide in the sequence of the
     * given protein in a map: peptide start index &gt; (amino acids before,
     * amino acids after). The start indexes are the ones returned by
     * getPeptideStart. Fewer than nAA amino acids (possibly none) are
     * returned where the peptide is close to a protein terminus.
     *
     * @param peptide the sequence of the peptide of interest
     * @param nAA the number of amino acids to include
     * @param sequenceMatchingPreferences the sequence matching preferences
     *
     * @return the amino acids surrounding a peptide in the protein sequence
     *
     * @throws IOException Exception thrown whenever an error occurred while
     * parsing the protein sequence
     */
    public HashMap<Integer, String[]> getSurroundingAA(String peptide, int nAA, SequenceMatchingPreferences sequenceMatchingPreferences) throws IOException {
        ArrayList<Integer> startIndexes = getPeptideStart(peptide, sequenceMatchingPreferences);
        HashMap<Integer, String[]> result = new HashMap<Integer, String[]>();
        for (int startIndex : startIndexes) {
            // one is subtracted from the start index to address the string
            int peptideBegin = startIndex - 1;
            int peptideEnd = peptideBegin + peptide.length();
            String[] flankingAminoAcids = new String[2];
            flankingAminoAcids[0] = clampedSubSequence(peptideBegin - nAA, peptideBegin);
            flankingAminoAcids[1] = clampedSubSequence(peptideEnd, peptideEnd + nAA);
            result.put(startIndex, flankingAminoAcids);
        }
        return result;
    }

    /**
     * Returns the part of the protein sequence in [from, to) after restricting
     * both bounds to the sequence. An empty string is returned when the
     * restricted range contains no amino acid.
     *
     * @param from the index of the first amino acid, inclusive
     * @param to the index of the last amino acid, exclusive
     *
     * @return the corresponding part of the protein sequence
     */
    private String clampedSubSequence(int from, int to) {
        int begin = Math.max(from, 0);
        int end = Math.min(to, sequence.length());
        if (begin >= end) {
            return "";
        }
        return sequence.substring(begin, end);
    }
}