package org.molgenis.mutation.util;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.molgenis.mutation.dto.ExonDTO;
public class SequenceUtils
{
private static final transient Logger logger = Logger.getLogger(SequenceUtils.class.getSimpleName());
public static enum AminoAcid
{
GCT("A", "Ala"), GCC("A", "Ala"), GCA("A", "Ala"), GCG("A", "Ala"), CGT("R", "Arg"), CGC("R", "Arg"), CGA("R",
"Arg"), CGG("R", "Arg"), AGA("R", "Arg"), AGG("R", "Arg"), AAT("N", "Asn"), AAC("N", "Asn"), GAT("D",
"Asp"), GAC("D", "Asp"), TGT("C", "Cys"), TGC("C", "Cys"), CAA("Q", "Gln"), CAG("Q", "Gln"), GAA("E",
"Glu"), GAG("E", "Glu"), GGT("G", "Gly"), GGC("G", "Gly"), GGA("G", "Gly"), GGG("G", "Gly"), CAT("H",
"His"), CAC("H", "His"), ATT("I", "Ile"), ATC("I", "Ile"), ATA("I", "Ile"), TTA("L", "Leu"), TTG("L",
"Leu"), CTT("L", "Leu"), CTC("L", "Leu"), CTA("L", "Leu"), CTG("L", "Leu"), AAA("K", "Lys"), AAG("K",
"Lys"), ATG("M", "Met"), TTT("F", "Phe"), TTC("F", "Phe"), CCT("P", "Pro"), CCC("P", "Pro"), CCA("P",
"Pro"), CCG("P", "Pro"), TCT("S", "Ser"), TCC("S", "Ser"), TCA("S", "Ser"), TCG("S", "Ser"), AGT("S",
"Ser"), AGC("S", "Ser"), TAA("X", "Ter"), TAG("X", "Ter"), TGA("X", "Ter"), ACT("T", "Thr"), ACC("T",
"Thr"), ACA("T", "Thr"), ACG("T", "Thr"), TGG("W", "Trp"), TAT("Y", "Tyr"), TAC("Y", "Tyr"), GTT("V",
"Val"), GTC("V", "Val"), GTA("V", "Val"), GTG("V", "Val");
private final String code_1;
private final String code_3;
AminoAcid(String code_1, String code_3)
{
this.code_1 = code_1;
this.code_3 = code_3;
}
public String getCode_1()
{
return this.code_1;
}
public String getCode_3()
{
return this.code_3;
}
}
public static String getAminoAcid1(String codon)
{
for (AminoAcid aminoAcid : AminoAcid.values())
if (aminoAcid.name().equals(codon)) return aminoAcid.getCode_1();
return "?";
}
public static String getAminoAcid3(String codon)
{
for (AminoAcid aminoAcid : AminoAcid.values())
if (aminoAcid.name().equals(codon)) return aminoAcid.getCode_3();
return "???";
}
public static String toAminoAcid3(String code1)
{
for (AminoAcid aminoAcid : AminoAcid.values())
if (aminoAcid.getCode_1().equals(code1)) return aminoAcid.getCode_3();
return "???";
}
/**
* Convert a amino acid sequence in one-letter-notation to
* three-letter-notation
*
* @param one
* -letter-notation
* @return three-letter-notation
*/
public static String convAaSequence(String seq)
{
StringBuffer buf = new StringBuffer();
for (char aminoAcid : seq.toCharArray())
buf.append(SequenceUtils.toAminoAcid3(new Character(aminoAcid).toString()));
return buf.toString();
}
public static void main(String[] args)
{
String one = "MSVGELYSQCTRVWIPDPDEVWRSAELTKDYKEGDKSLQLRLEDETILEYPIDVQRNQLPFLRNPDILVGENDLTALSYLHEPAVLHNLKVRFLESNHIYTYCGIVLVAINPYEQLPIYGQDVIYTYSGQNMGDMDPHIFAVAEEAYKQMARDEKNQSIIVSGESGAGKTVSAKYAMRYFATVGGSASETNIEEKVLASSPIMEAIGNAKTTRNDNSSRFGKYIQIGFDKRYHIIGANMRTYLLEKSRVVFQADDERNYHIFYQLCAAAGLPEFKELALTSAEDFFYTSQGGDTSIEGVDDAEDFEKTRQAFTLLGVKESHQMSIFKIIASILHLGSVAIQAERDGDSCSISPQDVYLSNFCRLLGVEHSQMEHWLCHRKLVTTSETYVKTMSLQQVINARNALAKHIYAQLFGWIVEHINKALHTSLKQHSFIGVLDIYGFETFEVNSFEQFCINYANEKLQQQFNSHVFKLEQEEYMKEQIPWTLIDFYDNQPCIDLIEAKLGILDLLDEECKVPKGTDQNWAQKLYDRHSSSQHFQKPRMSNTAFIIVHFADKVEYLSDGFLEKNRDTVYEEQINILKASKFPLVADLFHDDKDPVPATTPGKGSSSKISVRSARPPMKVSNKEHKKTVGHQFRTSLHLLMETLNATTPHYVRCIKPNDEKLPFHFDPKRAVQQLRACGVLETIRISAAGYPSRWAYHDFFNRYRVLVKKRELANTDKKAICRSVLENLIKDPDKFQFGRTKIFFRAGQVAYLEKLRADKFRTATIMIQKTVRGWLQKVKYHRLKGATLTLQRYCRGHLARRLAEHLRRIRAAVVLQKHYRMQRARQAYQRVRRAAVVIQAFTRAMFVRRTYRQVLMEHKATTIQKHVRGWMARRHFQRLRDAAIVIQCAFRMLKARRELKALRIEARSAEHLKRLNVGMENKVVQLQRKIDEQNKEFKTLSEQLSVTTSTYTMEVERLKKELVHYQQSPGEDTSLRLQEEVESLRTELQRAHSERKILEDAHSREKDELRKRVADLEQENALLKDEKEQLNNQILCQSKDEFAQNSVKENLMKKELEEERSRYQNLVKEYSQLEQRYDNLRDEMTIIKQTPGHRRNPSNQSSLESDSNYPSISTSEIGDTEDALQQVEEIGLEKAAMDMTVFLKLQKRVRELEQERKKLQVQLEKREQQDSKKVQAEPPQTDIDLDPNADLAYNSLKRQELESENKKLKNDLNELRKAVADQATQNNSSHGSPDSYSLLLNQLKLAHEELEVRKEEVLILRTQIVSADQRRLAGRNAEPNINARSSWPNSEKHVDQEDAIEAYHGVCQTNSKTEDWGYLNEDGELGLAYQGLKQVARLLEAQLQAQSLEHEEEVEHLKAQLEALKEEMDKQQQTFCQTLLLSPEAQVEFGVQQEISRLTNENLDLKELVEKLEKNERKLKKQLKIYMKKAQDLEAAQALAQSERKRHELNRQVTVQRKEKDFQGMLEYHKEDEALLIRNLVTDLKPQMLSGTVPCLPAYILYMCIRHADYTNDDLKVHSLLTSTINGIKKVLKKHNDDFEMTSFWLSNTCRLLHCLKQYSGDEGFMTQNTAKQNEHCLKNFDLTEYRQVLSDLSIQIYQQLIKIAEGVLQPMIVSAMLENESIQGLSGVKPTGYRKRSSSMADGDNSYCLEAIIRQMNAFHTVMCDQGLDPEIILQVFKQLFYMINAVTLNNLLLRKDVCSWSTGMQLRYNISQLEEWLRGRNLHQSGAVQTMEPLIQAAQLLQLKKKTQEDAEAICSLCTSLSTQQIVKILNLYTPLNEFEERVTVAFIRTIQAQLQERNDPQQLLLDAKHMFPVLFPFNPSSLTMDSIHIPACLNLEFLNEV";
System.out.println(">>> three==" + SequenceUtils.convAaSequence(one));
String two = "erPheGluGlnPh";
System.out.println(">>> fulls==" + SequenceUtils.getNumFullAminoAcids(two) + ", parts=="
+ SequenceUtils.getNumPartAminoAcids(two));
}
/**
* Get the number of amino acids in given sequence, excluding the ones only
* partially present
*
* @param sequence
* @return Number of amino acids
* @throws Exception
*/
public static int getNumFullAminoAcids(String sequence)
{
String[] tokens = StringUtils.splitByCharacterTypeCamelCase(sequence);
if (tokens.length < 1) return 0;
int numFullAas = tokens.length;
if (tokens[0].length() < 3) numFullAas--;
if (tokens[tokens.length - 1].length() < 3) numFullAas--;
return numFullAas;
}
/**
* Get the number of amino acids in given sequence, including the ones only
* partially present
*
* @param sequence
* @return Number of amino acids
* @throws Exception
*/
public static Integer getNumPartAminoAcids(String sequence)
{
String[] tokens = StringUtils.splitByCharacterTypeCamelCase(sequence);
return tokens.length;
}
/**
* Get the number of Gly-X-Y repeats, i.e. the number of GlyXxxYyy repeats.
*
* @param sequence
* @return number of Gly-X-Y repeats
*/
public static Integer getNumGlyXYRepeats(String sequence)
{
int numGly = 0; // counter for total number of Gly in sequence
for (int i = 0; i < sequence.length(); i++)
{
int localNumGly = 0;
int glyPos = StringUtils.indexOf(sequence, "Gly", i);
while (glyPos + 8 < sequence.length())
{
localNumGly++;
glyPos = glyPos + 9;
if (!StringUtils.substring(sequence, glyPos, glyPos + 3).equals("Gly")) break;
}
if (localNumGly > numGly) numGly = localNumGly;
}
return numGly;
}
/**
* Splice sequence, i.e. remove intronic part which has to be in lowercase
* letters
*
* @param sequence
* @return spliced sequence
*/
public static String splice(String sequence)
{
String sequence2 = sequence;
sequence2 = StringUtils.replace(sequence2, "a", "");
sequence2 = StringUtils.replace(sequence2, "c", "");
sequence2 = StringUtils.replace(sequence2, "g", "");
sequence2 = StringUtils.replace(sequence2, "t", "");
// TODO: Hardcoded removal of UTR of exon 1
// sequence2 = StringUtils.substring(sequence2, 108);
return sequence2;
}
/**
* Get the amino acid sequence (translate) for a spliced nucleotide sequence
*
* @param nuclSequence
* @return amino acid sequence
*/
public static String translate(String nuclSequence)
{
String aaSequence = "";
for (int i = 0; i < nuclSequence.length() - 1; i = i + 3)
{
String codon = StringUtils.substring(nuclSequence, i, i + 3);
aaSequence = aaSequence + SequenceUtils.getAminoAcid3(codon);
}
return aaSequence;
}
/**
* Get index of first occurence of codon after given startCodonNum
*
* @param sequence
* @param startCodonNum
* @return codon number of first occurence, -1 if no match or null sequence
* input
*/
public static int indexOfCodon(String sequence, String codon, int startCodonNum)
{
if (sequence == null || codon == null) return -1;
int codonPos = StringUtils.indexOf(sequence, codon, startCodonNum * 3) + 1; // NT
// pos
// starts
// at
// 1
// not
// 0
return SequenceUtils.getCodonNum(codonPos);
}
/**
* Get the nucleotide at the given position starting at 1
*
* @param sequence
* @param indexOf
* @return nucleotide
*/
public static String getNucleotide(String sequence, int indexOf)
{
return StringUtils.substring(sequence, indexOf - 1, indexOf);
}
/**
* Get number of codon at given position
*
* @param position
* @return codon number
*/
public static int getCodonNum(int position)
{
return new Double(Math.ceil(position / 3.0)).intValue();
}
/**
* Get the codon with the given number starting at 1
*
* @param sequence
* @param codonNum
* @return codon
*/
public static String getCodon(String sequence, int codonNum)
{
int start = codonNum * 3 - 3;
int end = start + 3; // 2;
logger.debug("getCodon: start==" + start + ", end==" + end);
return StringUtils.substring(sequence, start, end);
/*
* for (int i = 0; i < sequence.length(); i = i + 3) if ((i + 3) / 3 ==
* codonNum) return sequence.substring(i, i + 3); return "";
*/
}
/**
* Get the codon at the given position starting at 0
*
* @param sequence
* @param position
* @return codon
*/
public static String getCodonByPosition(String sequence, int position)
{
if (sequence == null) return "";
return StringUtils.substring(sequence, position, position + 3);
}
/**
* Get the cDNA position. In case of an intronic position remove additive
* part (e.g. 1234+5 -> 1234)
*
* @param position
* @return cDNA position
* @throws RESyntaxException
*/
public static int getCDNAPosition(String position)
{
Pattern reExon = Pattern.compile("^(\\d+)$");
Pattern reIntron = Pattern.compile("^(\\d+)([+-])(\\d+)$");
Matcher mExon = reExon.matcher(position);
Matcher mIntron = reIntron.matcher(position);
if (mExon.matches()) return new Integer(mExon.group(1));
else if (mIntron.matches()) return new Integer(mIntron.group(1));
else
return 0;
}
/**
* Get position where 'add' has been added.
*
* @param position
* @param add
* @return Added position
* @throws RESyntaxException
*/
public static String getAddedPosition(String position, Integer add)
{
Pattern reExon = Pattern.compile("^(\\d+)$");
Pattern reIntron = Pattern.compile("^(\\d+)([+-])(\\d+)$");
Matcher mExon = reExon.matcher(position);
Matcher mIntron = reIntron.matcher(position);
if (mExon.matches()) return new Integer(new Integer(mExon.group(1)) + add).toString();
else if (mIntron.matches()) if (mIntron.group(2).equals("-")) if (add < new Integer(mIntron.group(3))) return mIntron
.group(1) + mIntron.group(2) + (new Integer(mIntron.group(3)).intValue() - add.intValue());
else
return String.valueOf(new Integer(mIntron.group(1)).intValue() - new Integer(mIntron.group(3)).intValue()
+ add.intValue());
else
return mIntron.group(1) + mIntron.group(2) + (new Integer(mIntron.group(3)).intValue() + add.intValue());
else
return "0";
}
/**
* Get the gDNA position.
*
* @param position
* @param exon
* @return gDNA position
* @throws RESyntaxException
*/
public static int getGDNAPosition(String position, ExonDTO exonDTO)
{
Pattern reExon = Pattern.compile("^(\\d+)$");
Pattern reIntron = Pattern.compile("^(\\d+)([+-])(\\d+)$");
Matcher mExon = reExon.matcher(position);
Matcher mIntron = reIntron.matcher(position);
if (mExon.matches())
{
if ("R".equals(exonDTO.getOrientation()))
{
// exon.gDNA - (mutation.cDNA - exon.cDNA)
return Math.abs(exonDTO.getGdnaStart() - (Integer.valueOf(mExon.group(1)) - exonDTO.getCdnaStart()));
}
else
{
// exon.gDNA + (mutation.cDNA - exon.cDNA)
return Math.abs(exonDTO.getGdnaStart() + (Integer.valueOf(mExon.group(1)) - exonDTO.getCdnaStart()));
}
}
else if (mIntron.matches())
{
if (mIntron.group(2).equals("+"))
{
if ("R".equals(exonDTO.getOrientation()))
{
// intron.gdnaPos + 1 (back to exon) - difference ('+' means
// downstream)
return Math.abs(exonDTO.getGdnaStart() + 1 - Integer.valueOf(mIntron.group(3)));
}
else
{
// intron.gdnaPos - 1 (start at last position of exon) +
// difference ('+' means upstream)
return Math.abs(exonDTO.getGdnaStart() - 1 + Integer.valueOf(mIntron.group(3)));
}
}
else
{
if ("R".equals(exonDTO.getOrientation()))
{
// intron.gdnaPos - intron.length + difference ('-' means
// upstream)
return Math.abs(exonDTO.getGdnaStart() - exonDTO.getLength() + Integer.valueOf(mIntron.group(3)));
}
else
{
// intron.gdnaPos - difference ('-' means downstream)
return Math.abs(exonDTO.getGdnaStart() - Integer.valueOf(mIntron.group(3)));
}
}
}
else
return 0;
}
/**
* Get start position (in bases) of first triplet change in two sequences
*
* @param sequence1
* @param sequence2
* @return first triplet change position
*/
public static int getFirstTripletChange(String sequence1, String sequence2)
{
int changePos = StringUtils.indexOfDifference(sequence1, sequence2) + 1;
int tripletChangePos = changePos % 3 > 0 ? changePos % 3 : 3;
int tripletStart = changePos - tripletChangePos;
return tripletStart;
}
}