package org.genedb.util; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; import org.biojava.bio.BioException; import org.biojava.bio.seq.DNATools; import org.biojava.bio.seq.RNATools; import org.biojava.bio.seq.io.CrossProductTokenization; import org.biojava.bio.seq.io.SymbolTokenization; import org.biojava.bio.symbol.Alphabet; import org.biojava.bio.symbol.AlphabetManager; import org.biojava.bio.symbol.IllegalSymbolException; import org.biojava.bio.symbol.SimpleSymbolList; import org.biojava.bio.symbol.Symbol; import org.biojava.bio.symbol.SymbolList; import org.biojava.bio.symbol.SymbolListViews; import org.biojava.bio.symbol.TranslationTable; /** * For translating DNA sequences into proteins. Each instance corresponds * to a particular genetic code. * * @author rh11 * */ class Translator { private static final TranslationTable transcriptionTable = RNATools.transcriptionTable(); private static final SymbolTokenization dnaTokenization; static { try { dnaTokenization = DNATools.getDNA().getTokenization("token"); } catch (BioException e) { throw new IllegalStateException("BioJava appears to be broken", e); } } private TranslationTable translationTable; private StartCodonTable startCodonTable; /** * Get the Translator corresponding to the specified genetic code, * as defined by NCBI. * * @see http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi * @param geneticCodeId the ID number of the genetic code * @return */ public static Translator getTranslator(int geneticCodeId) { return new Translator(geneticCodeId); } private Translator(int translationTableId) { this.translationTable = RNATools.getGeneticCode(translationTableId); this.startCodonTable = StartCodonTable.getTable(translationTableId); } public String translate(String dnaSequence, int phase) throws TranslationException { return translate(dnaSequence, phase, false); } public String translate(String dnaSequence, int phase, boolean stopCodonTranslatedAsSelenocysteine) throws TranslationException { try { SymbolList dna = new SimpleSymbolList(dnaTokenization, dnaSequence); SymbolList rna = SymbolListViews.translate(dna, transcriptionTable); rna = rna.subList(phase + 1, phase + 3 * ((rna.length() - phase) / 3)); SymbolList rnaWindowed = SymbolListViews.windowedSymbolList(rna, 3); SymbolList protein = SymbolListViews.translate(rnaWindowed, translationTable); String naiveTranslation = protein.seqString(); if (startCodonTable.contains(rnaWindowed.symbolAt(1))) { naiveTranslation = "M" + naiveTranslation.substring(1); } if (stopCodonTranslatedAsSelenocysteine) { if (naiveTranslation.endsWith("*")) { return naiveTranslation.substring(0, naiveTranslation.length() - 1).replaceAll("\\*", "U") + '*'; } else { return naiveTranslation.replaceAll("\\*", "U"); } } return naiveTranslation; } catch (BioException e) { throw new TranslationException ("Failed to translate cds", e); } } } /** * Represents the start codons used by a particular genetic code, * as a set of BioJava Symbols in the alphabet <code>RNA x RNA x RNA</code>. * * @author rh11 */ class StartCodonTable { public static StartCodonTable getTable(int geneticCodeId) { if (geneticCodeId < 0 || geneticCodeId >= tables.length || tables[geneticCodeId] == null) throw new IllegalArgumentException(String.format("No such genetic code (%d)", geneticCodeId)); return tables[geneticCodeId]; } private static final char[] bases = new char[] {'U', 'C', 'A', 'G'}; private static final SymbolTokenization codonTokenization; static { final Alphabet rnaAlphabet = AlphabetManager.alphabetForName("RNA"); final Alphabet codonAlphabet = AlphabetManager.alphabetForName("(RNA x RNA x RNA)"); try { final List<SymbolTokenization> threeTokens = Collections.nCopies(3, rnaAlphabet.getTokenization("token")); codonTokenization = new CrossProductTokenization(codonAlphabet, threeTokens); } catch (BioException e) { throw new RuntimeException("BioJava appears to be broken", e); } } private Set<Symbol> symbols = new HashSet<Symbol> (); private static final StartCodonTable[] tables = new StartCodonTable[] { /* 0 */ null, /* 1 */ new StartCodonTable("---M---------------M---------------M----------------------------"), /* 2 */ new StartCodonTable("--------------------------------MMMM---------------M------------"), /* 3 */ new StartCodonTable("----------------------------------MM----------------------------"), /* 4 */ new StartCodonTable("--MM---------------M------------MMMM---------------M------------"), /* 5 */ new StartCodonTable("---M----------------------------MMMM---------------M------------"), /* 6 */ new StartCodonTable("-----------------------------------M----------------------------"), /* 7 */ null, /* 8 */ null, /* 9 */ new StartCodonTable("-----------------------------------M---------------M------------"), /* 10 */ new StartCodonTable("-----------------------------------M----------------------------"), /* 11 */ new StartCodonTable("---M---------------M------------MMMM---------------M------------"), /* 12 */ new StartCodonTable("-------------------M---------------M----------------------------"), /* 13 */ new StartCodonTable("---M------------------------------MM---------------M------------"), /* 14 */ new StartCodonTable("-----------------------------------M----------------------------"), /* 15 */ new StartCodonTable("-----------------------------------M----------------------------"), /* 16 */ new StartCodonTable("-----------------------------------M----------------------------"), /* 17 */ null, /* 18 */ null, /* 19 */ null, /* 20 */ null, /* 21 */ new StartCodonTable("-----------------------------------M---------------M------------"), /* 22 */ new StartCodonTable("-----------------------------------M----------------------------"), /* 23 */ new StartCodonTable("--------------------------------M--M---------------M------------"), }; /** * Create a new StartCodonTableImpl from a string representing the 'starts' * line in the NCBI format. * * @param starts */ private StartCodonTable(String starts) { char[] startsChars = starts.toCharArray(); int i = 0; for (char base1 : bases) { for (char base2 : bases) { for (char base3 : bases) { if (startsChars[i++] == 'M') { final String codonString = String.format("(%c %c %c)", base1, base2, base3); try { final Symbol codonSymbol = codonTokenization.parseToken(codonString); symbols.add(codonSymbol); } catch (IllegalSymbolException exception) { throw new RuntimeException( String.format( "BioJava failed to recognise codon '%s'. This should never happen.", codonString), exception); } } } } } } /** * Does this set contain the specified symbol? * * @param symbol * @return */ public boolean contains(Symbol symbol) { return symbols.contains(symbol); } }