/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.ronn;

import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.data.sequence.FastaSequence;
import org.biojava.nbio.data.sequence.SequenceUtil;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;


/**
 * This class gives public API to RONN functions.
 * It is built on top of the command line client. Due to this fact a few things
 * could be improved and extended pending the refactoring of the command line client.
 *
 * The input sequence limitations - the input sequence must not contain any ambiguous characters,
 * and have a minimum length of 19 amino acids.
 *
 * @author Peter Troshin
 * @version 1.0
 * @since 3.0.2
 *
 */
public class Jronn implements Serializable {

    private static final long serialVersionUID = 8104272449130849946L;

    // Models are loaded once, when the class is initialised, and shared by every prediction.
    private static final ModelLoader loader = new ModelLoader();
    static {
        try {
            loader.loadModels();
        } catch (NumberFormatException e) {
            throw new RuntimeException("Fails to load models!" + e.getMessage(), e);
        } catch (IOException e) {
            throw new RuntimeException("Fails to load models!" + e.getMessage(), e);
        }
    }


    /**
     * Holder for the ranges, contain pointers to starting and ending position
     * on the sequence which comprises a disordered region. Immutable.
     * <p>
     * Equality and hash code are based on {@link #from} and {@link #to} only;
     * {@link #score} does not take part in either.
     *
     * @author pvtroshin
     */
    public static class Range {
        /**
         * Range starting position counts from 1 (the first position on the sequence is 1)
         */
        public final int from;
        /**
         * The range ending position includes the last residue.
         */
        public final int to;

        /**
         * Score attached to the range by whoever constructed it.
         */
        public final float score;

        /**
         * Creates a range covering positions {@code from..to}, both inclusive.
         *
         * @param from the starting position; must not be negative
         * @param to the ending position; must not be smaller than {@code from}
         * @param score the score to attach to the range
         */
        public Range(int from, int to, float score) {
            assert from>=0;
            // A region made of a single residue is legal, so from == to is allowed.
            assert from<=to;
            this.from = from;
            this.to = to;
            this.score = score;
        }

        @Override
        public String toString() {
            return "Range" + " From:" + from + "\t" + "to: " + to + "\n";
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + from;
            result = prime * result + to;
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj)
                return true;
            if (obj == null)
                return false;
            if (getClass() != obj.getClass())
                return false;
            Range other = (Range) obj;
            if (from != other.from)
                return false;
            if (to != other.to)
                return false;
            return true;
        }
    }

    /**
     * Calculates the probability value for each residue in the protein sequence,
     * telling the probability that the residue belongs to disordered region.
     * In general, values greater than 0.5 considered to be in the disordered regions.
     *
     * @param sequence an instance of FastaSequence object, holding the name and the sequence.
     * @return the probability scores for each residue in the sequence
     */
    public static float[] getDisorderScores(FastaSequence sequence) {
        return predictSerial(sequence);
    }

    /**
     * Calculates the probability value for each residue in the protein sequence,
     * telling the probability that the residue belongs to disordered region.
     * In general, values greater than 0.5 considered to be in the disordered regions.
     *
     * @param sequence an instance of ProteinSequence object; it is converted with
     *        {@link #convertProteinSequencetoFasta(ProteinSequence)} before prediction.
     * @return the probability scores for each residue in the sequence
     */
    public static float[] getDisorderScores(ProteinSequence sequence) {
        FastaSequence seq = convertProteinSequencetoFasta(sequence);
        return predictSerial(seq);
    }

    /**
     * Utility method to convert a BioJava ProteinSequence object to the FastaSequence
     * object used internally in JRonn.
     * <p>
     * Every compound whose short name matches {@code SequenceUtil.NON_AA} is written
     * as {@code X}; all other short names are copied unchanged.
     *
     * @param sequence the protein sequence to convert; its accession ID becomes the
     *        identifier of the resulting FastaSequence
     * @return a FastaSequence carrying the accession ID and the converted residue string
     */
    public static FastaSequence convertProteinSequencetoFasta(ProteinSequence sequence){
        // Local, single-threaded accumulator: StringBuilder avoids StringBuffer's locking.
        StringBuilder residues = new StringBuilder();
        for (AminoAcidCompound compound : sequence) {
            String c = compound.getShortName();
            if (! SequenceUtil.NON_AA.matcher(c).find()) {
                residues.append(c);
            } else {
                residues.append("X");
            }
        }

        return new FastaSequence(sequence.getAccession().getID(), residues.toString());
    }

    /**
     * Validates the sequence with {@code ORonn.validateSequenceForRonn} and runs a single
     * RONN prediction on it using the shared model loader.
     *
     * @param fsequence the sequence to score
     * @return the mean scores reported by the prediction
     * @throws RuntimeException wrapping any NumberFormatException or IOException raised
     *         while the prediction runs
     */
    private static float[] predictSerial(FastaSequence fsequence) {
        ORonn.validateSequenceForRonn(fsequence);
        try {
            return new ORonn(fsequence, loader).call().getMeanScores();
        } catch (NumberFormatException e) {
            throw new RuntimeException("Jronn fails to load models " + e.getLocalizedMessage(), e);
        } catch (IOException e) {
            throw new RuntimeException("Jronn fails to load models " + e.getLocalizedMessage(), e);
        }
    }

    /**
     * Calculates the disordered regions of the sequence. More formally, the regions for which the
     * probability of disorder is greater than
     * {@code RonnConstraint.DEFAULT_RANGE_PROBABILITY_THRESHOLD}.
     *
     * @param sequence an instance of FastaSequence object, holding the name and the sequence.
     * @return the array of ranges, one per disordered region; the array is empty (never
     *         {@code null}) when no residue exceeds the threshold.
     */
    public static Range[] getDisorder(FastaSequence sequence) {
        float[] scores = getDisorderScores(sequence);
        return scoresToRanges(scores, RonnConstraint.DEFAULT_RANGE_PROBABILITY_THRESHOLD);
    }

    /**
     * Convert raw scores to ranges. Gives ranges for given probability of disorder value.
     * <p>
     * Each score is rounded to two decimal places before it is compared with the threshold.
     * Positions in the returned ranges count from 1 and both ends are inclusive. A region
     * consisting of a single residue yields a range whose {@code from} equals its {@code to},
     * regardless of where in the sequence it occurs.
     *
     * @param scores the raw probability of disorder scores for each residue in the sequence.
     * @param probability the cut off threshold. Include all residues with the probability of
     *        disorder greater than this value
     * @return the array of ranges, in sequence order; the array is empty (never {@code null})
     *         when no residue has a probability of disorder greater than {@code probability}.
     */
    public static Range[] scoresToRanges(float[] scores, float probability) {
        assert scores!=null && scores.length>0;
        assert probability>0 && probability<1;

        int count = 0;      // 1-based position of the residue being examined
        int regionLen = 0;  // length of the disordered run ending just before 'count'
        List<Range> ranges = new ArrayList<Range>();
        for (float score : scores) {
            count++;
            // Round to 2 decimal points before comparison
            score = (float) (Math.round(score*100.0)/100.0);
            if (score > probability) {
                regionLen++;
            } else {
                if (regionLen > 0) {
                    // The run occupies positions count-regionLen .. count-1.
                    // NOTE(review): the score stored here is the rounded score of the residue
                    // that terminated the run, not of a residue inside it, whereas the
                    // trailing case below stores the raw score of the last residue.
                    // Kept as is; confirm the intended meaning of Range.score.
                    ranges.add(new Range(count-regionLen, count-1, score));
                }
                regionLen = 0;
            }
        }
        // A run that reaches the very end of the sequence is still open here. Use the same
        // "at least one residue" rule as inside the loop so that a single disordered
        // residue at the end is not silently dropped.
        if (regionLen > 0) {
            ranges.add(new Range(count-regionLen+1, count, scores[scores.length-1]));
        }
        return ranges.toArray(new Range[ranges.size()]);
    }

    /**
     * Calculates the probability of disorder scores for each residue in the sequence for
     * many sequences in the input.
     *
     * @param sequences the list of the FastaSequence objects
     * @return the Map with key {@code FastaSequence}, value probability of disorder for
     *         each residue
     * @see #getDisorder(FastaSequence)
     */
    public static Map<FastaSequence,float[]> getDisorderScores(List<FastaSequence> sequences) {
        Map<FastaSequence,float[]> results = new TreeMap<FastaSequence, float[]>();
        for (FastaSequence fsequence : sequences) {
            results.put(fsequence, predictSerial(fsequence));
        }
        return results;
    }

    /**
     * Calculates the disordered regions of the sequence for many sequences in the input.
     *
     * @param sequences the list of the FastaSequence objects
     * @return the Map with key {@code FastaSequence}, value the array of disordered regions
     *         found for that sequence
     * @see #getDisorder(FastaSequence)
     */
    public static Map<FastaSequence,Range[]> getDisorder(List<FastaSequence> sequences) {
        Map<FastaSequence,Range[]> disorderRanges = new TreeMap<FastaSequence,Range[]>();
        for (FastaSequence fs : sequences) {
            disorderRanges.put(fs, getDisorder(fs));
        }
        return disorderRanges;
    }

    /**
     * Calculates the disordered regions of the protein sequence.
     *
     * @param fastaFile input file name containing the sequence in FASTA
     * @return the Map with key {@code FastaSequence}, value the array of disordered regions
     *         for each sequence
     * @throws FileNotFoundException if the input file cannot be found
     * @throws IOException if the system cannot access or read from the input file
     * @see #getDisorder(FastaSequence)
     * @see Range
     */
    public static Map<FastaSequence,Range[]> getDisorder(String fastaFile) throws FileNotFoundException, IOException {
        FileInputStream input = new FileInputStream(fastaFile);
        List<FastaSequence> sequences;
        try {
            sequences = SequenceUtil.readFasta(input);
        } finally {
            // Release the file handle even if parsing fails. Closing a stream that has
            // already been closed has no effect, so this is safe whether or not
            // readFasta closes it itself.
            input.close();
        }
        return getDisorder(sequences);
    }

    /*
     * TODO
     *
     * High performance method for calculating disorder. Use multiple threads to achieve the speedup.
     *
     * @param fastaFile fully qualified path to the input FASTA file
     * @param outputFile file name of the file for the results
     * @param threadNumber the number of threads to use, default
     * @param controls the format of the result file
     * @throws FileNotFoundException if input file in not found
     * @throws IOException if the input or the output files cannot be accessed
     * @see ORonn.ResultLayout

    public static void calculateDisorder(String fastaFile, String outputFile, int threadNumber, ResultLayout layout) throws FileNotFoundException, IOException {
        final List<FastaSequence> sequences = SequenceUtil.readFasta(new FileInputStream(fastaFile));
        InputParameters in = new InputParameters();
        in.setFilePrm(fastaFile, InputParameters.inputKey);
        in.setFilePrm(outputFile, InputParameters.outputKey);
        //in.setThreadNum(Integer.toString(threadNumber));
        ORonn.predictParallel(sequences, in, loader);
    }
    */
}