/* * Genoogle: Similar DNA Sequences Searching Engine and Tools. (http://genoogle.pih.bio.br) * Copyright (C) 2008,2009,2010,2011,2012 Felipe Fernandes Albrecht (felipe.albrecht@gmail.com) * * For further information check the LICENSE file. */ package bio.pih.genoogle.statistics; import java.util.Map; import bio.pih.genoogle.alignment.SubstitutionMatrix; import bio.pih.genoogle.seq.Alphabet; import bio.pih.genoogle.seq.SymbolList; import com.google.common.collect.Maps; /** * Class to calculate statistics values for similar sequences searching process. * * @author albrecht (felipe.albrecht@gmail.com) **/ public class SubstitutionMatrixStatistics implements Statistics { /** * Calculate the probability of each score and return a {@link Map} with the probabilities. */ private Map<Integer, Double> scoreProbabilities(SubstitutionMatrix sm, SymbolList query) throws IndexOutOfBoundsException { int min = sm.getMin(); int max = sm.getMax(); int delta; if (min < 0) { delta = Math.abs(min); } else { delta = -min; } int scoreProbabilitiesSize = (min + delta) + (max + delta) + 1; double[] scoreProbabilities = new double[scoreProbabilitiesSize]; for (int i = min + delta; i <= max + delta; i++) { scoreProbabilities[i] = 0.0; } int numRegularLettersInQuery = 0; int length = query.getLength(); for (int i = 1; i <= length; i++) { if (alphabet.isValid(query.symbolAt(i))) { numRegularLettersInQuery++; } } double probPrior = 1000 / sm.getSymbolsCount(); double probability = probPrior / numRegularLettersInQuery; for (int i = 1; i <= length; i++) { char a = query.symbolAt(i); for (char c : sm.getSymbols()) { int score = sm.getValue(a, c); scoreProbabilities[score + delta] += probability; } } final double sum = 1000.00; for (int i = min + delta; i <= max+ delta; i++) { double p = scoreProbabilities[i]; scoreProbabilities[i] = p / sum; } Map<Integer, Double> scoreProbabilitiesMap = Maps.newHashMap(); for (int i = min; i <= max; i++) { scoreProbabilitiesMap.put(i, scoreProbabilities[i + delta]); } return scoreProbabilitiesMap; } /** * Calculate the lambda from the scores. * * @param prob * Scores probabilities. * @param mismatch * mismatch score. * @param match * match score. * @return Lambda value. */ private Double calculateLambda(Map<Integer, Double> prob, SubstitutionMatrix sm) { double lambda0 = BLAST_KARLIN_LAMBDA0_DEFAULT; for (int j = 0; j < 20; j++) { double sum = -1.0; double slope = 0.0; if (lambda0 < 0.01) { break; } double x0 = Math.exp(lambda0); double x1 = Math.pow(x0, sm.getMin() - 1); if (Double.compare(x1, 0.0) == 0) { break; } for (int i = sm.getMin(); i <= sm.getMax(); i++) { x1 *= x0; double temp = prob.get(i) * x1; sum += temp; slope += temp * i; } double amt = sum / slope; lambda0 -= amt; if (Math.abs(amt / lambda0) < BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT) { if (lambda0 > BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT) { return lambda0; } break; } } return blastKarlinLambdaBis(prob, sm.getMin(), sm.getMax()); } /** * Convergent method to find lambda. * * @param prob * Scores probabilities. * @param min * minimum score value. * @param max * max score value. * @return lambda value. */ private Double blastKarlinLambdaBis(Map<Integer, Double> prob, int min, int max) { if (!checkScoreRange(min, max)) { return null; } double up = BLAST_KARLIN_LAMBDA0_DEFAULT; double lambda = 0.0; for (;;) { up *= 2; double x0 = Math.exp(up); double x1 = Math.pow(x0, min - 1); double sum = 0.0; if (x1 > 0.0) { for (int i = min; i <= max; i++) { x1 *= x0; sum += prob.get(i) * x1; } } else { for (int i = min; i <= max; i++) { sum += prob.get(i) * Math.exp(up * i); } } if (sum >= 1.0) { break; } lambda = up; } for (int j = 0; j < BLAST_KARLIN_LAMBDA_ITER_DEFAULT; j++) { double newval = (lambda + up) / 2.0; double x0 = Math.exp(newval); double x1 = Math.pow(x0, min - 1); double sum = 0.0; if (x1 > 0.0) { for (int i = min; i <= max; i++) { x1 *= x0; sum += prob.get(i) * x1; } } else { for (int i = min; i <= max; i++) { sum += prob.get(i) * Math.exp(newval * i); } } if (sum > 1.0) { up = newval; } else { lambda = newval; } } return (lambda + up) / 2.; } /** * Calculate the H value. * * @param prob * Scores probabilities. * * @param lambda * lambda value. * * @param mismatch * mismatch score. * @param match * match score. * @return value of H. */ private double blastH(Map<Integer, Double> prob, double lambda, int min, int max) { if (lambda < 0.0) { return -1.0; } if (!checkScoreRange(min, max)) { return -1.0; } double etolam = Math.exp(lambda); double etolami = Math.pow(etolam, min - 1); double av = 0.0; if (etolami > 0.0) { for (int score = min; score <= max; score++) { etolami *= etolam; av += prob.get(score) * score * etolami; } } else { for (int score = min; score <= max; score++) { av += prob.get(score) * score * Math.exp(lambda * score); } } return lambda * av; } private int DIMOFP0(int iter, int range) { return iter * range + 1; } /** * Calculate the H value. * * @param prob * Scores probabilities. * @param lambda * lambda value. * @param h * value of H. * @param mismatch * mismatch score. * @param match * match score. * @return value of K. */ private double blastK(Map<Integer, Double> prob, double lambda, double h, int min, int max) { if (lambda <= 0.0 || h <= 0.0) { return -1.0; } double av = h / lambda; double etolam = Math.exp(lambda); if (min == -1 || max == 1) { double K = av; return K * (1.0 - 1.0 / etolam); } int low = min; int high = max; int range = high - low; double sumlimit = BLAST_KARLIN_K_SUMLIMIT_DEFAULT; int iter = BLAST_KARLIN_K_ITER_MAX; if (DIMOFP0(iter, range) > DIMOFP0_MAX) { return -1; } double[] P0 = new double[DIMOFP0(iter, range)]; double Sum = 0.0; int lo = 0; int hi = 0; double sum = 1.0; double oldsum = 1.0; double oldsum2 = 1.0; P0[0] = 1.0; int p = low; int first; int last; long j; for (j = 0; j < iter & sum > sumlimit; Sum += sum /= ++j) { first = last = range; lo += low; hi += high; int ptrP; for (ptrP = hi - lo; ptrP >= 0; P0[ptrP--] = sum) { int ptr1 = ptrP - first; int ptr1e = ptrP - last; int ptr2 = p + first; for (sum = 0.0; ptr1 >= ptr1e;) { sum += (P0[ptr1--] * prob.get(ptr2++)); } if (first != 0) { --first; } if (ptrP <= range) { --last; } } double etolami = Math.pow(etolam, lo - 1); int i; for (sum = 0.0, i = lo; i != 0; ++i) { etolami *= etolam; sum += P0[++ptrP] * etolami; } for (; i <= hi; ++i) { sum += P0[++ptrP]; } oldsum2 = oldsum; oldsum = sum; } /* Terms of geometric progression added for correction */ double ratio = oldsum / oldsum2; if (ratio >= (1.0 - sumlimit * 0001)) { return -1; } sumlimit *= 0.01; while (sum > sumlimit) { oldsum *= ratio; Sum += sum = oldsum / ++j; } int i; // Look for the greatest common divisor ("delta" in Appendix of PNAS 87 of Karlin&Altschul // (1990) for (i = 0, j = -low; i <= range && j > 1; ++i) { if (prob.get(p + i) != 0) { j = gcd(j, i); } } if (j * etolam > 0.05) { double etolami = Math.pow(etolam, -j); return j * Math.exp(-2.0 * Sum) / (av * (1.0 - etolami)); } else { return -j * Math.exp(-2.0 * Sum) / (av * Math.exp(-j * lambda) - 1); } } private boolean checkScoreRange(int min, int max) { if (min < MIN_SCORE || max > MAX_SCORE) { return false; } if (max - min > MAX_SCORE - MIN_SCORE) { return false; } return true; } /** * Adjust the query length. * * @param K * value of the K * @param LogK * value of the log of K * @param H * value of H * @param querySize * the length of the query * * @param databaseSize * the size, total of bases, in the data bank. * @param numberOfSequences * quantity of sequences in the data bank. * @return Length adjusted. */ private double lengthAdjust(double K, double LogK, double H, int querySize, long databaseSize, long numberOfSequences) { double lenghtAdjust = 0; double minimumQueryLength = 1 / K; for (int count = 0; count < 5; count++) { lenghtAdjust = (LogK + Math.log((querySize - lenghtAdjust) * (databaseSize - numberOfSequences * lenghtAdjust))) / H; if (lenghtAdjust > querySize - minimumQueryLength) { lenghtAdjust = querySize - minimumQueryLength; } } return lenghtAdjust; } /** * Normalize the score. * * @param nominalScore * nominal alignment score. * @return Alignment score normalized. */ public double nominalToNormalizedScore(double nominalScore) { return ((nominalScore * this.lambda) - this.logK) / LOG_2; } /** * Calculate the E-value from the normalized score. * * @param normalizedScore * @return E-Value. */ public double calculateEvalue(double normalizedScore) { return this.searchSpaceSize / Math.pow(2, normalizedScore); } /** * E-Value to the score. * * @param evalue * @return nominal score. */ public double gappedEvalueToNominal(double evalue) { double normalizedScore = Math.log(this.searchSpaceSize / evalue) / LOG_2; return Math.ceil((LOG_2 * normalizedScore + this.logK) / lambda); } /** * Look for the greatest common divisor */ public long gcd(long a, long b) { long c; b = Math.abs(b); if (b > a) { c = a; a = b; b = c; } while (b != 0) { c = a % b; a = b; b = c; } return a; } private final Map<Integer, Double> probabilities; private final double lambda; private final double H; private final double K; private final double logK; private final double effectiveQuerySize; private final double effectiveDatabaseSize; private final double searchSpaceSize; private final double lengthAdjust; private final Alphabet alphabet; /** * Create the statistics values from the giver query sequence, the data bank size and number of * sequences in this data bank. * * @param match * Scoring to a match. * @param mismatch * Scoring to a mismatch. * @param query * Input query sequence. * @param dataBankSize * quantity of bases in the all data bank sequences. * @param numberOfSequences * quantity of sequences in the data bank. */ public SubstitutionMatrixStatistics(Alphabet alphabet, SubstitutionMatrix sm , SymbolList query, long dataBankSize, long numberOfSequences) throws IndexOutOfBoundsException { this.alphabet = alphabet; this.probabilities = scoreProbabilities(sm, query); this.lambda = calculateLambda(probabilities, sm); this.H = blastH(probabilities, lambda, sm.getMin(), sm.getMax()); this.K = blastK(probabilities, lambda, H, sm.getMin(), sm.getMax()); this.logK = Math.log(K); this.lengthAdjust = lengthAdjust(K, logK, H, query.getLength(), dataBankSize, numberOfSequences); this.effectiveQuerySize = query.getLength() - lengthAdjust; this.effectiveDatabaseSize = dataBankSize - numberOfSequences * lengthAdjust; this.searchSpaceSize = effectiveQuerySize * effectiveDatabaseSize; } public void print_values(int match) { System.out.println((int) Math.floor(gappedEvalueToNominal(100) / match)); System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(100))))); System.out.println((int) Math.floor(gappedEvalueToNominal(10) / match)); System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(10))))); System.out.println((int) Math.floor(gappedEvalueToNominal(1) / match)); System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(1))))); System.out.println((int) Math.floor(gappedEvalueToNominal(0.1) / match)); System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.1))))); System.out.println((int) Math.floor(gappedEvalueToNominal(0.001) / match)); System.out.println(0.001); System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.001))))); System.out.println((int) Math.floor(gappedEvalueToNominal(0.00001) / match)); System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.00001))))); System.out.println((int) Math.floor(gappedEvalueToNominal(0.0000001) / match)); System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.0000001))))); System.out.println((int) Math.floor(gappedEvalueToNominal(0.0000000001) / match)); System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.0000000001))))); System.out.println((int) Math.floor(gappedEvalueToNominal(0.000000000001) / match)); System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.000000000001))))); System.out.println((int) Math.floor(gappedEvalueToNominal(0.000000000000001) / match)); System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.000000000000001))))); System.out.println((int) Math.floor(gappedEvalueToNominal(0.00000000000000001) / match)); System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.00000000000000001))))); System.out.println((int) Math.floor(gappedEvalueToNominal(0.0000000000000000001) / match)); System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.0000000000000000001))))); System.out.println((int) Math.floor(gappedEvalueToNominal(0.0000000000000000000001) / match)); System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.0000000000000000000001))))); System.out.println("lambda: " + lambda); System.out.println("H: " + H); System.out.println("K: " + K); System.out.println("Length Adjust: " + lengthAdjust); System.out.println("Effective query size: " + effectiveQuerySize); System.out.println("Effective database size: " + effectiveDatabaseSize); System.out.println("Search space size: " + searchSpaceSize); } }