package statalign.model.subst.plugins; import java.awt.Color; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import statalign.io.RawSequences; import statalign.model.score.plugins.DNAScore; import statalign.model.subst.RecognitionError; import statalign.model.subst.SubstitutionModel; /** * This is the abstract class for nucleotide models. * * @author lyngsoe */ public abstract class NucleotideModel extends SubstitutionModel{ /* Old model parameters */ protected double oldparams[]; public static String type = "nucleotide"; private boolean printRNA; // true if should print RNA nucleotides (U for T) /** * This constructor reads the alphabet from data/DNAalphabet.dat * @throws IOException */ protected NucleotideModel() throws IOException{ attachedScoringScheme = new DNAScore(); String alphabetFile = "data/DNAalphabet.dat"; ClassLoader cl = getClass().getClassLoader(); BufferedReader bf = new BufferedReader(new InputStreamReader(cl.getResource(alphabetFile).openStream())); String a = bf.readLine(); alphabet = new char[a.length()]; for(int i = 0; i < alphabet.length; i++){ alphabet[i] = a.charAt(i); } int size = alphabet.length; v = new double[size][size]; w = new double[size][size]; d = new double[size]; e = new double[size]; } /** * Reads rates and equilibriums from given files. Does not set the params array. * * @param rateFile substitution rates file * @param equFile equilibrium probabilities file */ protected NucleotideModel(String rateFile, String equFile) throws IOException { attachedScoringScheme = new DNAScore(); String alphabetFile = "data/DNAalphabet.dat"; ClassLoader cl = getClass().getClassLoader(); //System.out.println(cl+" "+cl.getResource(alphabetFile)+" "+alphabetFile); BufferedReader bf = new BufferedReader(new InputStreamReader(cl.getResource(alphabetFile).openStream())); // BufferedReader bf = new BufferedReader(new FileReader(alphabetFile)); String a = bf.readLine(); alphabet = new char[a.length()]; for(int i = 0; i < alphabet.length; i++){ alphabet[i] = a.charAt(i); } int size = alphabet.length; v = new double[size][size]; w = new double[size][size]; d = new double[size]; e = new double[size]; bf = new BufferedReader(new InputStreamReader(cl.getResource(rateFile).openStream())); // bf = new BufferedReader(new FileReader(rate)); String[] temp; for(int i = 0; i < size; i++){ temp = (bf.readLine()).split(" "); for(int j = 0; j < size; j++){ v[i][j] = Double.parseDouble(temp[j]); } } String t = bf.readLine(); //reading an empty line for(int i = 0; i < size; i++){ temp = (bf.readLine()).split(" "); for(int j = 0; j < size; j++){ // System.out.println(i+" "+j+" "+temp[j]); w[i][j] = Double.parseDouble(temp[j]); } } t = bf.readLine(); //reading an empty line bf = new BufferedReader(new InputStreamReader(cl.getResource(equFile).openStream())); // bf = new BufferedReader(new FileReader(equilibrium)); for(int i = 0; i < size; i++){ t = bf.readLine(); e[i] = Double.parseDouble(t); } } /** * This function decides if this model can analyze input sequences. * accepted characters are 'acgutryswmkbdhvn'. Our program is * case insensitive. */ @Override public double acceptable(RawSequences r) { String acceptableCharacters = "acgturyswmkbdhvn"; int[] count = new int[5]; // nucleotide counts for(int i = 0; i < r.size(); i++){ String sequence = r.getSequence(i); for(int j = 0; j < sequence.length(); j++){ char ch = sequence.charAt(j); if(ch != '-' && ch != ' ') { ch = Character.toLowerCase(ch); int ind = acceptableCharacters.indexOf(ch); if(ind == -1){ throw new RecognitionError(getMenuName()+" cannot be used with the current sequences because they contain the character '"+ch+"'!\n"); } else if(ind < count.length){ count[ind]++; } } } } int uind = count.length-1; if(count[uind] > count[uind-1]) printRNA = true; // automatically choose RNA output if seen more U than T else if(count[uind-1] > count[uind]) printRNA = false; count[uind-1] += count[uind]; // then treat Us as Ts int sum = 0; for(int i = 0; i < uind; i++){ if(count[i] > 0) sum++; } return (double)sum/uind; } /** * This function assigns colors to characters. 'A' is red, 'C' is blue, * 'G' is orange, 'T' or 'U' is green, the remaining characters (ambiguous characters * and the gap symbol) are grey */ @Override public Color getColor(char c) { c = Character.toUpperCase(c); switch (c) { case 'A': return Color.RED; case 'C': return Color.BLUE; case 'G': return Color.ORANGE; case 'T': case 'U': return Color.GREEN; default: return Color.GRAY; } } /** * returns the description of the current parameters of the model */ @Override public String print() { return ""; } void setDiagonal(){} /** * restore the parameters to the old values when a parameter-changing * proposal is not accepted. */ @Override public void restoreParameter() { for(int i = 0; i < params.length; i++){ params[i] = oldparams[i]; } setDiagonal(); } /** * Returns with the most likely character, given a Felsentein * likelihood array. It can handle ambiguous characters. If the * probabilities in the Felsentein array are all zero, returns '*'. */ @Override public char mostLikely(double[] seq) { /* Create binary encoding of characters with probability * higher than 0.5. */ int code = 0; int radix = 1; double max = 0.0; char character = '*'; for(int i = 0; i < seq.length; i++){ if(seq[i] > 0.5){ /* Character has probability higher than 0.5 */ if (max > 0.5) /* ...but is not the first high probability * character seen. */ code += radix; else{ /* ...and is the first high probability character seen */ max = seq[i]; code = radix; } } else{ /* Character has probability at most 0.5 */ if (seq[i] > max){ /* ...but is most probable character seen so far */ max = seq[i]; code = radix; } else if (seq[i] == max) /* ...but is among the most probable characters * seen so far. */ code += radix; } radix <<= 1; } if (max == 0.0) code = 0; /* Look up corresponding character */ String conversionTable = printRNA ? "*ACMGRSVUWYHKDBN" : "*ACMGRSVTWYHKDBN"; character = conversionTable.charAt(code); return character; } /** * Selects whether to print RNA nucleotides (U for T) or default DNA ones. * Affects the beviour of {@link #mostLikely(double[])}. */ public void setPrintRNA(boolean printRNA) { this.printRNA = printRNA; } }