package com.compomics.util.experiment.biology;

import com.compomics.util.experiment.biology.aminoacids.*;
import com.compomics.util.preferences.SequenceMatchingPreferences;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Class representing amino acids.
 *
 * Besides the per-instance description (codes, name, masses, genetic code),
 * the class exposes one shared constant per implemented amino acid and static
 * lookup helpers working on single letter codes and genetic triplets.
 *
 * Two amino acids are considered equal when their single letter codes are
 * equal ignoring case; {@link #hashCode()} is consistent with that definition.
 *
 * @author Marc Vaudel
 * @author Harald Barsnes
 */
public abstract class AminoAcid implements Serializable {

    /**
     * Serial number for backward compatibility.
     */
    static final long serialVersionUID = -3158896310928354857L;

    // Shared instances, one per implemented single letter code. B, J, Z and X
    // are documented below (see iscombination()) as combinations of amino acids.
    public static final AminoAcid A = new Alanine();
    public static final AminoAcid C = new Cysteine();
    public static final AminoAcid D = new AsparticAcid();
    public static final AminoAcid E = new GlutamicAcid();
    public static final AminoAcid F = new Phenylalanine();
    public static final AminoAcid G = new Glycine();
    public static final AminoAcid H = new Histidine();
    public static final AminoAcid I = new Isoleucine();
    public static final AminoAcid K = new Lysine();
    public static final AminoAcid L = new Leucine();
    public static final AminoAcid M = new Methionine();
    public static final AminoAcid N = new Asparagine();
    public static final AminoAcid P = new Proline();
    public static final AminoAcid Q = new Glutamine();
    public static final AminoAcid R = new Arginine();
    public static final AminoAcid S = new Serine();
    public static final AminoAcid T = new Threonine();
    public static final AminoAcid V = new Valine();
    public static final AminoAcid W = new Tryptophan();
    public static final AminoAcid Y = new Tyrosine();
    public static final AminoAcid U = new Selenocysteine();
    public static final AminoAcid O = new Pyrrolysine();
    public static final AminoAcid B = new B();
    public static final AminoAcid J = new J();
    public static final AminoAcid Z = new Z();
    public static final AminoAcid X = new X();

    /**
     * Single letter code of the amino acid.
     */
    public String singleLetterCode;
    /**
     * Three letter code of the amino acid.
     */
    public String threeLetterCode;
    /**
     * Name of the amino acid.
     */
    public String name;
    /**
     * Average mass of the amino acid.
     */
    public double averageMass;
    /**
     * Monoisotopic mass of the amino acid. Only used by
     * {@link #getMonoisotopicMass()} when no atom chain is set.
     *
     * @deprecated use the atomchain
     */
    protected Double monoisotopicMass;
    /**
     * The monoisotopic atom chain.
     */
    protected AtomChain monoisotopicAtomChain;
    /**
     * The mass tolerance used for the indistinguishable amino acids in cache.
     * Not read anywhere in this class; kept so that the serialized form of the
     * class is left untouched.
     */
    private Double indistinguishableAACacheMass = null;
    /**
     * The sub amino acids, excluding amino acid combinations.
     */
    protected char[] subAminoAcidsWithoutCombination;
    /**
     * The sub amino acids, including amino acid combinations.
     */
    protected char[] subAminoAcidsWithCombination;
    /**
     * The amino acid combinations which might represent this amino acid.
     */
    protected char[] aminoAcidCombinations;
    /**
     * The standard genetic code.
     */
    protected String[] standardGeneticCode;
    /**
     * The amino acid one letter codes as char array.
     */
    private static final char[] aminoAcidChars = new char[]{'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
        'N', 'P', 'Q', 'R', 'S', 'T', 'Y', 'U', 'O', 'V', 'W', 'B', 'J', 'Z', 'X'};
    /**
     * A char array of the one letter code of amino acids without combinations
     * of amino acids.
     */
    private static final char[] uniqueAminoAcidChars = new char[]{'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
        'N', 'P', 'Q', 'R', 'S', 'T', 'Y', 'U', 'O', 'V', 'W'};
    /**
     * The amino acid one letter codes as string array. The order of this array
     * decides which candidate {@link #getMatchingAminoAcid} returns first.
     */
    public static final String[] aminoAcidStrings = new String[]{"A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M",
        "N", "P", "Q", "R", "S", "T", "Y", "U", "O", "V", "W", "B", "J", "Z", "X"};

    /**
     * Convenience method returning an array of all implemented amino acids
     * represented by their single letter code.
     *
     * Note that the shared internal array is returned, not a copy: callers
     * must not modify it.
     *
     * @return an array of all implemented amino acids
     */
    public static char[] getAminoAcids() {
        return aminoAcidChars;
    }

    /**
     * Returns the single letter code as character, i.e. the first character of
     * the single letter code.
     *
     * @return the single letter code as character
     */
    public char getSingleLetterCodeAsChar() {
        return singleLetterCode.charAt(0);
    }

    /**
     * Convenience method returning a list of all implemented amino acids.
     *
     * The list is a fixed-size view backed by {@link #aminoAcidStrings}:
     * elements cannot be added or removed, and replacing an element writes
     * through to the array.
     *
     * @return a list of all implemented amino acids represented by their
     * single letter code
     */
    public static List<String> getAminoAcidsList() {
        return Arrays.asList(aminoAcidStrings);
    }

    /**
     * Returns a char array of the one letter code of amino acids without
     * combinations of amino acids.
     *
     * Note that the shared internal array is returned, not a copy: callers
     * must not modify it.
     *
     * @return a char array of the one letter code of amino acids without
     * combinations of amino acids
     */
    public static char[] getUniqueAminoAcids() {
        return uniqueAminoAcidChars;
    }

    /**
     * Returns the amino acid corresponding to the letter given. If more than
     * one letter is given only the first one will be accounted for.
     *
     * @param aa the amino acid single letter code as a String, must contain at
     * least one character
     *
     * @return the corresponding amino acid
     *
     * @throws IllegalArgumentException if no amino acid is implemented for the
     * first letter of the given String
     */
    public static AminoAcid getAminoAcid(String aa) {
        return getAminoAcid(aa.charAt(0));
    }

    /**
     * Returns the amino acid corresponding to the letter given. Upper and
     * lower case letters are both accepted.
     *
     * @param letter the letter given
     *
     * @return the corresponding amino acid
     *
     * @throws IllegalArgumentException if no amino acid is implemented for the
     * given letter
     */
    public static AminoAcid getAminoAcid(char letter) {
        switch (letter) {
            case 'A':
            case 'a':
                return AminoAcid.A;
            case 'C':
            case 'c':
                return AminoAcid.C;
            case 'D':
            case 'd':
                return AminoAcid.D;
            case 'E':
            case 'e':
                return AminoAcid.E;
            case 'F':
            case 'f':
                return AminoAcid.F;
            case 'G':
            case 'g':
                return AminoAcid.G;
            case 'H':
            case 'h':
                return AminoAcid.H;
            case 'I':
            case 'i':
                return AminoAcid.I;
            case 'K':
            case 'k':
                return AminoAcid.K;
            case 'L':
            case 'l':
                return AminoAcid.L;
            case 'M':
            case 'm':
                return AminoAcid.M;
            case 'N':
            case 'n':
                return AminoAcid.N;
            case 'P':
            case 'p':
                return AminoAcid.P;
            case 'Q':
            case 'q':
                return AminoAcid.Q;
            case 'R':
            case 'r':
                return AminoAcid.R;
            case 'S':
            case 's':
                return AminoAcid.S;
            case 'T':
            case 't':
                return AminoAcid.T;
            case 'V':
            case 'v':
                return AminoAcid.V;
            case 'W':
            case 'w':
                return AminoAcid.W;
            case 'Y':
            case 'y':
                return AminoAcid.Y;
            case 'B':
            case 'b':
                return AminoAcid.B;
            case 'Z':
            case 'z':
                return AminoAcid.Z;
            case 'X':
            case 'x':
                return AminoAcid.X;
            case 'U':
            case 'u':
                return AminoAcid.U;
            case 'J':
            case 'j':
                return AminoAcid.J;
            case 'O':
            case 'o':
                return AminoAcid.O;
            default:
                throw new IllegalArgumentException("No amino acid found for letter " + letter + ".");
        }
    }

    /**
     * Indicates whether the amino acid object refers to a combination of amino
     * acids like B, J, Z or X.
     *
     * @return a boolean indicating whether the amino acid object refers to a
     * combination of amino acids like B, J, Z or X
     */
    public abstract boolean iscombination();

    /**
     * In case of a combination of amino acids, returns the comprised amino
     * acids or amino acid groups represented by their single letter code
     * including sub combinations. The actual content is set by the
     * implementing class (under the IUPAC convention Z would for instance
     * stand for E or Q).
     *
     * The internal array is returned, not a copy.
     *
     * @return the actual amino acids
     */
    public char[] getSubAminoAcids() {
        return getSubAminoAcids(true);
    }

    /**
     * In case of a combination of amino acids, returns the comprised amino
     * acids or amino acid groups represented by their single letter code. The
     * actual content is set by the implementing class (under the IUPAC
     * convention Z would for instance stand for E or Q).
     *
     * The internal array is returned, not a copy.
     *
     * @param includeCombinations if true, sub-amino acids which are amino acids
     * combinations like Z will also be included
     *
     * @return the actual amino acids
     */
    public char[] getSubAminoAcids(boolean includeCombinations) {
        return includeCombinations ? subAminoAcidsWithCombination : subAminoAcidsWithoutCombination;
    }

    /**
     * Returns the amino acids combinations which might represent this amino
     * acid. The actual content is set by the implementing class.
     *
     * The internal array is returned, not a copy.
     *
     * @return the amino acids combinations which might represent this amino
     * acid
     */
    public char[] getCombinations() {
        return aminoAcidCombinations;
    }

    /**
     * Returns a matching amino acid using the given preferences. The amino acid
     * is unique when different possibilities are found, then for instance I is
     * returned for both I and L. Candidates are tested in the order of
     * {@link #aminoAcidStrings} and the first suitable one is returned. When
     * the given amino acid is a combination, candidates which are among its
     * sub amino acids are skipped.
     *
     * @param aminoAcid the single letter code of the amino acid of interest
     * @param sequenceMatchingPreferences the sequence matching preferences
     *
     * @return a matching amino acid using the given matching type and
     * massTolerance
     *
     * @throws IllegalArgumentException if the given amino acid is not
     * implemented or if no candidate is found
     */
    public static String getMatchingAminoAcid(String aminoAcid, SequenceMatchingPreferences sequenceMatchingPreferences) {

        AminoAcid reference = AminoAcid.getAminoAcid(aminoAcid);
        AminoAcidPattern referencePattern = AminoAcidPattern.getAminoAcidPatternFromString(aminoAcid);

        for (String candidate : aminoAcidStrings) {

            if (!referencePattern.matches(candidate, sequenceMatchingPreferences)) {
                continue;
            }

            if (!reference.iscombination()) {
                return candidate;
            }

            // For a combination, only accept a candidate which is not one of
            // its own sub amino acids.
            char candidateChar = candidate.charAt(0);
            boolean isSubAminoAcid = false;
            for (char subAminoAcid : reference.getSubAminoAcids()) {
                if (subAminoAcid == candidateChar) {
                    isSubAminoAcid = true;
                    break;
                }
            }
            if (!isSubAminoAcid) {
                return candidate;
            }
        }

        throw new IllegalArgumentException("No unique amino acid found for amino acid " + aminoAcid);
    }

    /**
     * Returns the matching sequence of a given sequence. For example both
     * PEPTLDE and PEPTIDE will return PEPTIDE when I and L are considered as
     * indistinguishable. See getMatchingAminoAcid for more details.
     *
     * @param sequence the sequence of interest
     * @param sequenceMatchingPreferences the sequence matching preferences
     *
     * @return the matching sequence
     *
     * @throws IllegalArgumentException if getMatchingAminoAcid fails for one of
     * the characters of the sequence
     */
    public static String getMatchingSequence(String sequence, SequenceMatchingPreferences sequenceMatchingPreferences) {
        int length = sequence.length();
        StringBuilder matchingSequence = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            String residue = String.valueOf(sequence.charAt(i));
            matchingSequence.append(getMatchingAminoAcid(residue, sequenceMatchingPreferences));
        }
        return matchingSequence.toString();
    }

    /**
     * Returns the standard genetic triplets associated to this amino acid.
     *
     * The internal array is returned, not a copy.
     *
     * @return the standard genetic triplets associated to this amino acid
     */
    public String[] getStandardGeneticCode() {
        return standardGeneticCode;
    }

    /**
     * Returns the amino acid from the standard genetic code. Null if not
     * coding for an amino acid (e.g. TAA). The comparison is case sensitive
     * and expects upper case DNA letters. Note that TAG is mapped to O and TGA
     * to U.
     *
     * @param geneticCode the three letter genetic code of the desired amino
     * acid
     *
     * @return the amino acid from the standard genetic code, null if none
     */
    public static AminoAcid getAminoAcidFromGeneticCode(String geneticCode) {
        if (geneticCode.equals("TTT") || geneticCode.equals("TTC")) {
            return F;
        } else if (geneticCode.equals("TTA") || geneticCode.equals("TTG")
                || geneticCode.equals("CTT") || geneticCode.equals("CTC")
                || geneticCode.equals("CTA") || geneticCode.equals("CTG")) {
            return L;
        } else if (geneticCode.equals("ATT") || geneticCode.equals("ATC") || geneticCode.equals("ATA")) {
            return I;
        } else if (geneticCode.equals("ATG")) {
            return M;
        } else if (geneticCode.startsWith("GT")) {
            return V;
        } else if (geneticCode.startsWith("TC")) {
            return S;
        } else if (geneticCode.startsWith("CC")) {
            return P;
        } else if (geneticCode.startsWith("AC")) {
            return T;
        } else if (geneticCode.startsWith("GC")) {
            return A;
        } else if (geneticCode.equals("TAT") || geneticCode.equals("TAC")) {
            return Y;
        } else if (geneticCode.equals("CAT") || geneticCode.equals("CAC")) {
            return H;
        } else if (geneticCode.equals("CAA") || geneticCode.equals("CAG")) {
            return Q;
        } else if (geneticCode.equals("AAT") || geneticCode.equals("AAC")) {
            return N;
        } else if (geneticCode.equals("AAA") || geneticCode.equals("AAG")) {
            return K;
        } else if (geneticCode.equals("GAT") || geneticCode.equals("GAC")) {
            return D;
        } else if (geneticCode.equals("GAA") || geneticCode.equals("GAG")) {
            return E;
        } else if (geneticCode.equals("TGT") || geneticCode.equals("TGC")) {
            return C;
        } else if (geneticCode.equals("TGG")) {
            return W;
        } else if (geneticCode.startsWith("CG")) {
            return R;
        } else if (geneticCode.equals("AGT") || geneticCode.equals("AGC")) {
            return S;
        } else if (geneticCode.equals("AGA") || geneticCode.equals("AGG")) {
            return R;
        } else if (geneticCode.startsWith("GG")) {
            return G;
        } else if (geneticCode.equals("TAG")) {
            return O;
        } else if (geneticCode.equals("TGA")) {
            return U;
        }
        return null;
    }

    /**
     * Returns the genetic code as combination of the sub amino acid genetic
     * codes. Sub amino acids which are themselves combinations are skipped and
     * every triplet is listed once, in order of first occurrence.
     *
     * @return the genetic code as combination of the sub amino acid genetic
     * codes
     */
    protected String[] getStandardGeneticCodeForCombination() {
        ArrayList<String> uniqueCodes = new ArrayList<String>();
        for (char subAminoAcidChar : getSubAminoAcids()) {
            AminoAcid subAminoAcid = AminoAcid.getAminoAcid(subAminoAcidChar);
            if (subAminoAcid.iscombination()) {
                continue;
            }
            for (String code : subAminoAcid.getStandardGeneticCode()) {
                if (!uniqueCodes.contains(code)) {
                    uniqueCodes.add(code);
                }
            }
        }
        return uniqueCodes.toArray(new String[uniqueCodes.size()]);
    }

    /**
     * Returns the monoisotopic atom chain representing this amino acid.
     *
     * @return the monoisotopic atom chain representing this amino acid
     */
    public AtomChain getMonoisotopicAtomChain() {
        return monoisotopicAtomChain;
    }

    /**
     * Returns the mass of the amino acid. The mass of the monoisotopic atom
     * chain is used when a chain is set, the deprecated mass field otherwise.
     *
     * @return the mass of the amino acid
     */
    public Double getMonoisotopicMass() {
        if (monoisotopicAtomChain == null) {
            return monoisotopicMass;
        }
        return monoisotopicAtomChain.getMass();
    }

    /**
     * Two amino acids are equal when their single letter codes are equal
     * ignoring case. An amino acid without single letter code is only equal to
     * itself.
     *
     * @param obj the object to compare to
     *
     * @return true if the given object is an amino acid with the same single
     * letter code
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof AminoAcid)) {
            return false;
        }
        String otherCode = ((AminoAcid) obj).singleLetterCode;
        // equalsIgnoreCase(null) is false, so a null code on either side makes
        // two distinct instances unequal.
        return otherCode != null && otherCode.equalsIgnoreCase(singleLetterCode);
    }

    /**
     * Hash code consistent with {@link #equals(Object)}: computed from the
     * single letter code with every character case-folded the same way
     * String.equalsIgnoreCase compares characters.
     *
     * @return the hash code of this amino acid
     */
    @Override
    public int hashCode() {
        if (singleLetterCode == null) {
            return 0;
        }
        int hash = 7;
        for (int i = 0; i < singleLetterCode.length(); i++) {
            char folded = Character.toLowerCase(Character.toUpperCase(singleLetterCode.charAt(i)));
            hash = 31 * hash + folded;
        }
        return hash;
    }
}