/* * Copyright (C) Lennart Martens * * Contact: lennart.martens AT UGent.be (' AT ' to be replaced with '@') */ /* * Created by IntelliJ IDEA. * User: Lennart * Date: 6-jan-03 * Time: 9:48:16 */ package com.compomics.util.nucleotide; import org.apache.log4j.Logger; import com.compomics.util.interfaces.Sequence; import com.compomics.util.general.MassCalc; import com.compomics.util.general.UnknownElementMassException; import com.compomics.util.protein.AASequenceImpl; import com.compomics.util.protein.Header; import java.util.Properties; import java.util.Vector; import java.util.HashMap; import java.io.IOException; /* * CVS information: * * $Revision: 1.6 $ * $Date: 2007/07/06 09:41:54 $ */ /** * This class represents a nucleotide sequence (RNA or DNA). * * @author Lennart Martens */ public class NucleotideSequenceImpl implements Sequence { // Class specific log4j logger for NucleotideSequenceImpl instances. Logger logger = Logger.getLogger(NucleotideSequenceImpl.class); /** * The sequence. */ private String iSequence = null; /** * The mass. It uses lazy caching. */ private double iMass = -1.0; /** * The Properties object with the conversion from DNA to * proteins. */ private static Properties iTranslate = null; /** * HashMap that defines the complementary nucleotides. */ private static Properties iComplement = null; /** * This constructor allows the construction of an Object, * wrapping a nucleotidesequence. The default codon usage table * for translation will be used when translating. * * @param aSequence String with the nucleotide sequence. */ public NucleotideSequenceImpl(String aSequence) { this.setSequence(aSequence); } /** * This constructor allows the construction of an Object, * wrapping a nucleotidesequence. Translation will be done using * the specified codon usage table. * * @param aSequence String with the nucleotide sequence. * @param aCodonUsageTable Properties instance with the codon usage table * (triplet is key, single-letter amino acid is value). */ public NucleotideSequenceImpl(String aSequence, Properties aCodonUsageTable) { iTranslate = aCodonUsageTable; this.setSequence(aSequence); } /** * This constructor allows the construction of an Object, * wrapping a nucleotidesequence. Translation will be done using * the specified codon usage table. * * @param aSequence String with the nucleotide sequence. * @param aCodonUsageTable Properties instance with the codon usage table * (triplet is key, single-letter amino acid is value). */ public NucleotideSequenceImpl(String aSequence, String aCodonUsageTable) { if(aCodonUsageTable != null) { iTranslate = loadProps(aCodonUsageTable); if(iTranslate.size() == 0) { throw new IllegalArgumentException("Unable to read or parse your codon usage table!"); } } this.setSequence(aSequence); } /** * This method will set the sequence. <br> * Note that most implementations will also allow you * to set this via the constructor. * * @param aSequence String with the sequence. */ public void setSequence(String aSequence) { this.iSequence = aSequence.trim().toUpperCase(); this.iMass = -1.0; } /** * This method reports on the length of the current sequence. * * @return int with the length of the sequence. */ public int getLength() { return iSequence.length(); } /** * This method will return the mass for the sequence. * * @return double with the mass. */ public double getMass() { // Lazy caching of the mass. if(iMass < 0.0) { try { this.iMass = new MassCalc(MassCalc.MONONUCLEOTIDES).calculateMass(iSequence); } catch(UnknownElementMassException ueme) { logger.error(ueme.getMessage(), ueme); } } return iMass; } /** * This method will retrieve the sequence. * * @return String with the sequence. */ public String getSequence() { return iSequence; } /** * This method translates the specified nucleotidesequence into * the six reading frames. * If an unkown nucleic acid 'N' is part of the codon, amino * acid 'X' will be inserted in the string. * * @return AASequenceImpl[] with the maximum of 6 translated protein sequences. */ public AASequenceImpl[] translate() { Vector seqs = new Vector(6); // We'll translate all based on the translation map. // This map is lazily cached at class level. if(iTranslate == null) { iTranslate = this.loadProps("DNA_Protein_Translation.properties"); } // We need to do sense first, and subsequently the complement. for(int i=0;i<3;i++) { String tempSeq = this.translate(iSequence, i); if((tempSeq != null) && (!tempSeq.trim().equals(""))) { seqs.add(new AASequenceImpl(tempSeq)); } } // Now the reverse complement. String inverse = this.getReverseComplementary(); for(int i=0;i<3;i++) { String tempSeq = this.translate(inverse, i); if((tempSeq != null) && (!tempSeq.trim().equals(""))) { seqs.add(new AASequenceImpl(tempSeq)); } } AASequenceImpl[] result = new AASequenceImpl[seqs.size()]; seqs.toArray(result); return result; } /** * This method translates the specified nucleotidesequence into * the six reading frames. While doing so, distinct entries are * generated when stop codons are encountered in the nucleotide sequence. * If an unkown nucleic acid 'N' is part of the codon, amino * acid 'X' will be inserted in the string. * * * @param aDatabaseIdentifier String to include the database origin in the protein entry annotation. * @param aShortOrganism String to include the organism origin in the protein entry annotation. * @return Vector with the maximum of 6 translated reading frames. * Each Vector element is a HashMap with all the proteins from one reading frame. * HashMap structure : Key - Header Instance ; Value - Protein instance of the translation. */ public Vector translateToStopCodonSeparatedEntries(String aDatabaseIdentifier, String aShortOrganism) { Vector allSeqs = new Vector(6); // We'll translate all based on the translation map. // This map is lazily cached at class level. if(iTranslate == null) { iTranslate = this.loadProps("DNA_Protein_Translation.properties"); } // We need to do sense first, and subsequently the complement. for(int i=0;i<3;i++) { String lEntryIdentifier = aDatabaseIdentifier + "-sense-" + (i+1); HashMap lHashSeq = this.translateEntriesSeparatedByStopCodon(iSequence, i, lEntryIdentifier, aShortOrganism); if(!lHashSeq.isEmpty()) { allSeqs.add(lHashSeq); } } // Now the reverse complement. String inverse = this.getReverseComplementary(); for(int i=0;i<3;i++) { String lEntryIdentifier = aDatabaseIdentifier + "-antisense-" + (i+1); HashMap lHashSeq = this.translateEntriesSeparatedByStopCodon(inverse, i, lEntryIdentifier, aShortOrganism); if(!lHashSeq.isEmpty()) { allSeqs.add(lHashSeq); } } return allSeqs; } /** * This method returns the reverse complementary strand for the * sequence. * * @return String with the reverse complementary sequence. */ public String getReverseComplementary() { // Lazy cache of complementary residues. if(iComplement == null) { iComplement = this.loadProps("complementaryNucleotides.properties"); } StringBuffer complement = new StringBuffer(); for(int i=iSequence.length();i>0;i--) { String key = iSequence.substring(i-1, i); complement.append(iComplement.get(key)); } return complement.toString(); } /** * This method will translate the specified DNA sequence into a * single String of amino acids, starting from the specified * base. * If an unkown nucleic acid 'N' is part of the codon, amino * acid 'X' will be inserted in the string. * * @param aSequence String with the sequence to be translated. * @param aStartFrame int with the index of the element that we * will start from (it is modulo'd by 3!!) * @return String with the translated sequence. */ private String translate(String aSequence, int aStartFrame) { StringBuffer aminoacidsequence = new StringBuffer(); aStartFrame = aStartFrame%3; for(int i=aStartFrame;i<aSequence.length()-2;i+=3) { String key = aSequence.substring(i, i+3); // If An Unknown Nucleic Acid 'N' is part of the Codon -> append amino acid 'X'. if(key.indexOf('N') >= 0){ aminoacidsequence.append("X"); }else{ // Otherwise translate the codon to the correct amino acid by the Codon Usage Table. aminoacidsequence.append((String)iTranslate.get(key)); } } return aminoacidsequence.toString(); } /** * This method will translate the specified DNA sequence * starting from the specified base into a HashMap with * a Header instance as a keys and an amino acid sequence as values. * A new entry is created each time we hit a STOP-CODON * in the nucleic acid sequence. * * If an unkown nucleic acid 'N' is part of the codon, amino * acid 'X' will be inserted in the string. * * @param aSequence String with the sequence to be translated. * @param aStartFrame int with the index of the element that we * will start from (it is modulo'd by 3!!) * @param aEntryIdentifier String that will be used in the Protein accession. * @param aShortOrganism String to include the organism origin in the protein entry annotation. * @return HashMap Key - Header Instance ; HashMap Value - Protein instance of the translation. * * <br /><strong>NOTE!</strong> Each Header instance contains the originale nucleic acid sequence that was being translated in the iRest field. */ private HashMap translateEntriesSeparatedByStopCodon(String aSequence, int aStartFrame, String aEntryIdentifier, String aShortOrganism) { HashMap seqs = new HashMap(); int counter = 0; StringBuffer aminoacidSequence = new StringBuffer(); aStartFrame = aStartFrame%3; String key = null; try { for(int i=aStartFrame;i<aSequence.length()-2;i+=3) { key = aSequence.substring(i, i+3); // 1. If An Unknown Nucleic Acid 'N' is part of the Codon -> append amino acid 'X'. if(key.indexOf('N') >= 0){ aminoacidSequence.append("X"); // 2. If a STOP-CODON ("_") is encountered // OR // If this is the last loop, we have to add a new entry in the sequence hashmap. }else if((iTranslate.get(key).equals("_")) || ((i+3) >= aSequence.length()-2)){ // Don't forget to attach the last codon! if((key.length() == 3) && (!iTranslate.get(key).equals("_"))){ aminoacidSequence.append((String)iTranslate.get(key)); } String sequence = aminoacidSequence.toString(); // If two stop codons are next to eachother, we have an empty protein sequence. Don't do anything then! if (!sequence.equals("")) { counter = counter + 1; int start = i - ((sequence.length())*3); int stop = i - 1; // Include the last entry if no stop codon. if(!iTranslate.get(key).equals("_")){ stop = stop + 3; start = start + 3; } String nucleicSequence = aSequence.substring(start, (stop+1)); // Protein accession is formed here. String lEntryIdentifier = aShortOrganism + counter + "_" + aEntryIdentifier; Header lEntryHeader = Header.parseFromFASTA(lEntryIdentifier + " [" + (start+1) + "-" + (stop+1) + "]" ); // Set the protein accession. lEntryHeader.setAccession(lEntryHeader.getAccession()); // Set the nucleic sequence. lEntryHeader.setRest(nucleicSequence); seqs.put(lEntryHeader, aminoacidSequence.toString()); // "reset" the StringBuffer && continue the loop. aminoacidSequence = new StringBuffer(); } }else{ // 3. Otherwise translate the codon into the correct amino acid by the Codon Usage Table. aminoacidSequence.append((String)iTranslate.get(key)); } } } catch (Exception e) { logger.error(e.getMessage(), e); } return seqs; } /** * This method loads a Properties instance from the classpath. * It returns an empty instance and displays an error message * if the Properties instance was not found. * * @param aPropFileName String with the filename for the * properties file. * @return Properties with the props from the file, or an empty * instance if the file was not found. */ private Properties loadProps(String aPropFileName) { Properties p = new Properties(); try { p.load(this.getClass().getClassLoader().getResourceAsStream(aPropFileName)); } catch(IOException ioe) { logger.error("\nProperties file ("+aPropFileName+") not found in classpath!"); logger.error("All resultant values will be computed to 0.0!!\n"); } return p; } }