/*
* Genoogle: Similar DNA Sequences Searching Engine and Tools. (http://genoogle.pih.bio.br)
* Copyright (C) 2008,2009,2010,2011,2012 Felipe Fernandes Albrecht (felipe.albrecht@gmail.com)
*
* For further information check the LICENSE file.
*/
package bio.pih.genoogle.statistics;
import java.util.Map;
import bio.pih.genoogle.alignment.SubstitutionMatrix;
import bio.pih.genoogle.seq.Alphabet;
import bio.pih.genoogle.seq.SymbolList;
import com.google.common.collect.Maps;
/**
* Class to calculate statistics values for similar sequences searching process.
*
* @author albrecht (felipe.albrecht@gmail.com)
**/
public class SubstitutionMatrixStatistics implements Statistics {
/**
* Calculate the probability of each score and return a {@link Map} with the probabilities.
*/
private Map<Integer, Double> scoreProbabilities(SubstitutionMatrix sm, SymbolList query)
throws IndexOutOfBoundsException {
int min = sm.getMin();
int max = sm.getMax();
int delta;
if (min < 0) {
delta = Math.abs(min);
} else {
delta = -min;
}
int scoreProbabilitiesSize = (min + delta) + (max + delta) + 1;
double[] scoreProbabilities = new double[scoreProbabilitiesSize];
for (int i = min + delta; i <= max + delta; i++) {
scoreProbabilities[i] = 0.0;
}
int numRegularLettersInQuery = 0;
int length = query.getLength();
for (int i = 1; i <= length; i++) {
if (alphabet.isValid(query.symbolAt(i))) {
numRegularLettersInQuery++;
}
}
double probPrior = 1000 / sm.getSymbolsCount();
double probability = probPrior / numRegularLettersInQuery;
for (int i = 1; i <= length; i++) {
char a = query.symbolAt(i);
for (char c : sm.getSymbols()) {
int score = sm.getValue(a, c);
scoreProbabilities[score + delta] += probability;
}
}
final double sum = 1000.00;
for (int i = min + delta; i <= max+ delta; i++) {
double p = scoreProbabilities[i];
scoreProbabilities[i] = p / sum;
}
Map<Integer, Double> scoreProbabilitiesMap = Maps.newHashMap();
for (int i = min; i <= max; i++) {
scoreProbabilitiesMap.put(i, scoreProbabilities[i + delta]);
}
return scoreProbabilitiesMap;
}
/**
* Calculate the lambda from the scores.
*
* @param prob
* Scores probabilities.
* @param mismatch
* mismatch score.
* @param match
* match score.
* @return Lambda value.
*/
private Double calculateLambda(Map<Integer, Double> prob, SubstitutionMatrix sm) {
double lambda0 = BLAST_KARLIN_LAMBDA0_DEFAULT;
for (int j = 0; j < 20; j++) {
double sum = -1.0;
double slope = 0.0;
if (lambda0 < 0.01) {
break;
}
double x0 = Math.exp(lambda0);
double x1 = Math.pow(x0, sm.getMin() - 1);
if (Double.compare(x1, 0.0) == 0) {
break;
}
for (int i = sm.getMin(); i <= sm.getMax(); i++) {
x1 *= x0;
double temp = prob.get(i) * x1;
sum += temp;
slope += temp * i;
}
double amt = sum / slope;
lambda0 -= amt;
if (Math.abs(amt / lambda0) < BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT) {
if (lambda0 > BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT) {
return lambda0;
}
break;
}
}
return blastKarlinLambdaBis(prob, sm.getMin(), sm.getMax());
}
/**
* Convergent method to find lambda.
*
* @param prob
* Scores probabilities.
* @param min
* minimum score value.
* @param max
* max score value.
* @return lambda value.
*/
private Double blastKarlinLambdaBis(Map<Integer, Double> prob, int min, int max) {
if (!checkScoreRange(min, max)) {
return null;
}
double up = BLAST_KARLIN_LAMBDA0_DEFAULT;
double lambda = 0.0;
for (;;) {
up *= 2;
double x0 = Math.exp(up);
double x1 = Math.pow(x0, min - 1);
double sum = 0.0;
if (x1 > 0.0) {
for (int i = min; i <= max; i++) {
x1 *= x0;
sum += prob.get(i) * x1;
}
} else {
for (int i = min; i <= max; i++) {
sum += prob.get(i) * Math.exp(up * i);
}
}
if (sum >= 1.0) {
break;
}
lambda = up;
}
for (int j = 0; j < BLAST_KARLIN_LAMBDA_ITER_DEFAULT; j++) {
double newval = (lambda + up) / 2.0;
double x0 = Math.exp(newval);
double x1 = Math.pow(x0, min - 1);
double sum = 0.0;
if (x1 > 0.0) {
for (int i = min; i <= max; i++) {
x1 *= x0;
sum += prob.get(i) * x1;
}
} else {
for (int i = min; i <= max; i++) {
sum += prob.get(i) * Math.exp(newval * i);
}
}
if (sum > 1.0) {
up = newval;
} else {
lambda = newval;
}
}
return (lambda + up) / 2.;
}
/**
* Calculate the H value.
*
* @param prob
* Scores probabilities.
*
* @param lambda
* lambda value.
*
* @param mismatch
* mismatch score.
* @param match
* match score.
* @return value of H.
*/
private double blastH(Map<Integer, Double> prob, double lambda, int min, int max) {
if (lambda < 0.0) {
return -1.0;
}
if (!checkScoreRange(min, max)) {
return -1.0;
}
double etolam = Math.exp(lambda);
double etolami = Math.pow(etolam, min - 1);
double av = 0.0;
if (etolami > 0.0) {
for (int score = min; score <= max; score++) {
etolami *= etolam;
av += prob.get(score) * score * etolami;
}
} else {
for (int score = min; score <= max; score++) {
av += prob.get(score) * score * Math.exp(lambda * score);
}
}
return lambda * av;
}
private int DIMOFP0(int iter, int range) {
return iter * range + 1;
}
/**
* Calculate the H value.
*
* @param prob
* Scores probabilities.
* @param lambda
* lambda value.
* @param h
* value of H.
* @param mismatch
* mismatch score.
* @param match
* match score.
* @return value of K.
*/
private double blastK(Map<Integer, Double> prob, double lambda, double h, int min, int max) {
if (lambda <= 0.0 || h <= 0.0) {
return -1.0;
}
double av = h / lambda;
double etolam = Math.exp(lambda);
if (min == -1 || max == 1) {
double K = av;
return K * (1.0 - 1.0 / etolam);
}
int low = min;
int high = max;
int range = high - low;
double sumlimit = BLAST_KARLIN_K_SUMLIMIT_DEFAULT;
int iter = BLAST_KARLIN_K_ITER_MAX;
if (DIMOFP0(iter, range) > DIMOFP0_MAX) {
return -1;
}
double[] P0 = new double[DIMOFP0(iter, range)];
double Sum = 0.0;
int lo = 0;
int hi = 0;
double sum = 1.0;
double oldsum = 1.0;
double oldsum2 = 1.0;
P0[0] = 1.0;
int p = low;
int first;
int last;
long j;
for (j = 0; j < iter & sum > sumlimit; Sum += sum /= ++j) {
first = last = range;
lo += low;
hi += high;
int ptrP;
for (ptrP = hi - lo; ptrP >= 0; P0[ptrP--] = sum) {
int ptr1 = ptrP - first;
int ptr1e = ptrP - last;
int ptr2 = p + first;
for (sum = 0.0; ptr1 >= ptr1e;) {
sum += (P0[ptr1--] * prob.get(ptr2++));
}
if (first != 0) {
--first;
}
if (ptrP <= range) {
--last;
}
}
double etolami = Math.pow(etolam, lo - 1);
int i;
for (sum = 0.0, i = lo; i != 0; ++i) {
etolami *= etolam;
sum += P0[++ptrP] * etolami;
}
for (; i <= hi; ++i) {
sum += P0[++ptrP];
}
oldsum2 = oldsum;
oldsum = sum;
}
/* Terms of geometric progression added for correction */
double ratio = oldsum / oldsum2;
if (ratio >= (1.0 - sumlimit * 0001)) {
return -1;
}
sumlimit *= 0.01;
while (sum > sumlimit) {
oldsum *= ratio;
Sum += sum = oldsum / ++j;
}
int i;
// Look for the greatest common divisor ("delta" in Appendix of PNAS 87 of Karlin&Altschul
// (1990)
for (i = 0, j = -low; i <= range && j > 1; ++i) {
if (prob.get(p + i) != 0) {
j = gcd(j, i);
}
}
if (j * etolam > 0.05) {
double etolami = Math.pow(etolam, -j);
return j * Math.exp(-2.0 * Sum) / (av * (1.0 - etolami));
} else {
return -j * Math.exp(-2.0 * Sum) / (av * Math.exp(-j * lambda) - 1);
}
}
private boolean checkScoreRange(int min, int max) {
if (min < MIN_SCORE || max > MAX_SCORE) {
return false;
}
if (max - min > MAX_SCORE - MIN_SCORE) {
return false;
}
return true;
}
/**
* Adjust the query length.
*
* @param K
* value of the K
* @param LogK
* value of the log of K
* @param H
* value of H
* @param querySize
* the length of the query
*
* @param databaseSize
* the size, total of bases, in the data bank.
* @param numberOfSequences
* quantity of sequences in the data bank.
* @return Length adjusted.
*/
private double lengthAdjust(double K, double LogK, double H, int querySize, long databaseSize,
long numberOfSequences) {
double lenghtAdjust = 0;
double minimumQueryLength = 1 / K;
for (int count = 0; count < 5; count++) {
lenghtAdjust = (LogK + Math.log((querySize - lenghtAdjust)
* (databaseSize - numberOfSequences * lenghtAdjust)))
/ H;
if (lenghtAdjust > querySize - minimumQueryLength) {
lenghtAdjust = querySize - minimumQueryLength;
}
}
return lenghtAdjust;
}
/**
* Normalize the score.
*
* @param nominalScore
* nominal alignment score.
* @return Alignment score normalized.
*/
public double nominalToNormalizedScore(double nominalScore) {
return ((nominalScore * this.lambda) - this.logK) / LOG_2;
}
/**
* Calculate the E-value from the normalized score.
*
* @param normalizedScore
* @return E-Value.
*/
public double calculateEvalue(double normalizedScore) {
return this.searchSpaceSize / Math.pow(2, normalizedScore);
}
/**
* E-Value to the score.
*
* @param evalue
* @return nominal score.
*/
public double gappedEvalueToNominal(double evalue) {
double normalizedScore = Math.log(this.searchSpaceSize / evalue) / LOG_2;
return Math.ceil((LOG_2 * normalizedScore + this.logK) / lambda);
}
/**
* Look for the greatest common divisor
*/
public long gcd(long a, long b) {
long c;
b = Math.abs(b);
if (b > a) {
c = a;
a = b;
b = c;
}
while (b != 0) {
c = a % b;
a = b;
b = c;
}
return a;
}
private final Map<Integer, Double> probabilities;
private final double lambda;
private final double H;
private final double K;
private final double logK;
private final double effectiveQuerySize;
private final double effectiveDatabaseSize;
private final double searchSpaceSize;
private final double lengthAdjust;
private final Alphabet alphabet;
/**
* Create the statistics values from the giver query sequence, the data bank size and number of
* sequences in this data bank.
*
* @param match
* Scoring to a match.
* @param mismatch
* Scoring to a mismatch.
* @param query
* Input query sequence.
* @param dataBankSize
* quantity of bases in the all data bank sequences.
* @param numberOfSequences
* quantity of sequences in the data bank.
*/
public SubstitutionMatrixStatistics(Alphabet alphabet, SubstitutionMatrix sm , SymbolList query, long dataBankSize,
long numberOfSequences) throws IndexOutOfBoundsException {
this.alphabet = alphabet;
this.probabilities = scoreProbabilities(sm, query);
this.lambda = calculateLambda(probabilities, sm);
this.H = blastH(probabilities, lambda, sm.getMin(), sm.getMax());
this.K = blastK(probabilities, lambda, H, sm.getMin(), sm.getMax());
this.logK = Math.log(K);
this.lengthAdjust = lengthAdjust(K, logK, H, query.getLength(), dataBankSize, numberOfSequences);
this.effectiveQuerySize = query.getLength() - lengthAdjust;
this.effectiveDatabaseSize = dataBankSize - numberOfSequences * lengthAdjust;
this.searchSpaceSize = effectiveQuerySize * effectiveDatabaseSize;
}
public void print_values(int match) {
System.out.println((int) Math.floor(gappedEvalueToNominal(100) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(100)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(10) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(10)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(1) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(1)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.1) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.1)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.001) / match));
System.out.println(0.001);
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.00001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.00001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.0000001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.0000001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.0000000001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.0000000001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.000000000001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.000000000001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.000000000000001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.000000000000001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.00000000000000001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.00000000000000001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.0000000000000000001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.0000000000000000001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.0000000000000000000001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.0000000000000000000001)))));
System.out.println("lambda: " + lambda);
System.out.println("H: " + H);
System.out.println("K: " + K);
System.out.println("Length Adjust: " + lengthAdjust);
System.out.println("Effective query size: " + effectiveQuerySize);
System.out.println("Effective database size: " + effectiveDatabaseSize);
System.out.println("Search space size: " + searchSpaceSize);
}
}