/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package org.twentyn.proteintodna; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; /** * This is in concept similar to GeneOptimizer, but a fresh re-write. It considers * the first 42 bp as "special" requiring checks for hairpins. It chooses RBS in a * manner similar to RBSChooser2 starting with a list of well-expressing RBS's and * initial 18 bp. It differs in that it considers alternate fill-in codons chosen * to eliminate secondary structure. The remaining codons in the sequence are done * by a more conventional CAI process. Throughout, forbidden sequences are eliminated. */ public class SlidingWindowOptimizer { private Translator translator; private CodonIndexer indexer; private SequenceChecker checker; private RBSChooser3 rbsChooser; private HairpinCounter haircounter; private SlidingWindowOptimizer() {} public static SlidingWindowOptimizer initiate() throws Exception { SlidingWindowOptimizer swo = new SlidingWindowOptimizer(); swo.translator = new Translator(); swo.rbsChooser = RBSChooser3.initiate(); swo.indexer = CodonIndexer.initiate(); swo.checker = new SequenceChecker(); swo.haircounter = new HairpinCounter(); return swo; } public Mrna mRNAconstruct(String peptide, Set<RBSOption> ignores) throws Exception { //Initialize the mRNA with the peptide and the "best" codon for each aa Mrna out = new Mrna(); out.peptide = peptide; out.codons = new int[peptide.length()]; for(int x=0; x<out.codons.length; x++) { out.codons[x] = 0; } //Choose the least edit distance RBS out.rbs = rbsChooser.choose(peptide, ignores); //Optimize while eliminating secondary structure and removing forbidden sites optimizeORF(out); return out; } /** * This is the codon selection algorithm * It scans from N to C terminus of the peptide in 6 amino acid windows * * * It permutes all ~2^6 codon options (64 typically, no more than 4096) * options for that peptide * * It eliminates all peptides that when combined with the previous 8 bp * causes a forbidden sequence * * It scores the remaining for hairpin count and elminates all but the * lowest secondary structure options * * It returns the first (and thus highest total CAI) solution * * It then slides over 3 amino acids, and thus only the first 9 bp from * the previous optimization are retained in the next iteration * * @param mrna * @throws Exception */ private void optimizeORF(Mrna mrna) throws Exception { //Make the mRNA divisible by 3 by adding stop codons int stopcount = mrna.peptide.length() % 3 + 6; //Create temporary codon arrays to hold new CDS String protein = mrna.peptide; int[] codonIndices = new int[mrna.codons.length + stopcount]; String[] codonValues = new String[mrna.codons.length + stopcount]; //Fill in the extra stop codons for(int i=0; i<stopcount; i++) { protein+="*"; //Add stops to the peptide codonIndices[mrna.codons.length + i] = 0; codonValues[mrna.codons.length + i] = "TAA"; } // TODO: @saurabh20n says: More optimizations: The loop below takes about 100ms on average // for each peptide sequence. We have heavily optimized the `for(Integer[] option : encodingOptions)` // loop. But there are still other opportunities to shave away time. // One thought: Memoize the computation for unique `aas6`s. There are 20^6 = 64M possibilities, // and this inner loop runs 10,896 times when a 16 paths (5 orf's each) and because // we run combinations, each protein is optimized multiple times! So an additional // memoization should be done at the higher level to optimize each orf only once // and again and again in pathways. //Scan through peptide 3 amino acids at a time for(int i=0; i<protein.length()-6; i=i+3) { //Pull out the next 6 amino acids as array char[] aas6 = new char[6]; for(int x=0; x<6; x++) { aas6[x] = protein.charAt(i+x); } //Pull out the count of codon options for each amino acid int[] diversity6 = new int[6]; for(int x=0; x<6; x++) { List<String> codons = indexer.aminoAcidToBestCodons.get(aas6[x]); diversity6[x] = codons.size(); } //Permute all codons for next 6 amino acids as encodingOptions List<Integer[]> encodingOptions = new ArrayList<>(); for(int p0=0; p0<diversity6[0]; p0++) { for(int p1=0; p1<diversity6[1]; p1++) { for(int p2=0; p2<diversity6[2]; p2++) { for(int p3=0; p3<diversity6[3]; p3++) { for(int p4=0; p4<diversity6[4]; p4++) { for(int p5=0; p5<diversity6[5]; p5++) { Integer[] option = new Integer[6]; option[0] = p0; option[1] = p1; option[2] = p2; option[3] = p3; option[4] = p4; option[5] = p5; encodingOptions.add(option); } } } } } } //Compute the preamble sequence (the previous 3 amino acids) String preamble = ""; //If this is the first iteration, the preamble is the rbs if(i==0) { preamble = mrna.rbs.rbs.toUpperCase(); //Otherwise it is the previous 3 codons } else { for(int p=i-3; p<i; p++) { preamble += codonValues[p]; } } //Choose the best option, checking for forbidden sequence and count hairpins Integer[] bestOption = null; double bestscore = 99999999; for(Integer[] option : encodingOptions) { //Construct the sequence for the option String sequence = preamble; for(int x=0; x<6; x++) { char aa = aas6[x]; List<String> codons = indexer.aminoAcidToBestCodons.get(aa); String codon = codons.get(option[x]); sequence += codon; } //Apply sequenceChecker to the sequence, abort if it is forbidden boolean checked = checker.check(sequence); if(checked) { //See if it is better with hairpins than the previous, if so update double score = haircounter.score(sequence); if(score < bestscore) { bestscore = score; bestOption = option; } } } //Update the CDS with the bestOption for(int x=0; x<6; x++) { codonIndices[i+x] = bestOption[x]; char aa = protein.charAt(i+x); codonValues[i+x] = indexer.aminoAcidToBestCodons.get(aa).get(bestOption[x]); } } //Update the mRNA with the optimized codons for(int i=0; i<mrna.codons.length; i++) { mrna.codons[i] = codonIndices[i]; } } public static void main(String[] args) throws Exception { String peptide = "MRKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFGYGVQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"; //Create an optimized mRNA for the peptide SlidingWindowOptimizer swo = SlidingWindowOptimizer.initiate(); Set<RBSOption> ignores = new HashSet<>(); long start = System.currentTimeMillis(); Mrna mrna = swo.mRNAconstruct(peptide, ignores); long end = System.currentTimeMillis(); long duration = end - start; double seconds = duration/1000.0; System.out.println("Took this long in seconds: " + seconds); String seq = mrna.toSeq(); System.out.println(seq); //Check the translation String cds = seq.substring(mrna.rbs.rbs.length(), seq.length()-3); String newpep = swo.translator.translate(cds); if(newpep.equals(peptide)) { System.out.println("Translation ok"); } else { throw new Exception(); } } }