/*
* Genoogle: Similar DNA Sequences Searching Engine and Tools. (http://genoogle.pih.bio.br)
* Copyright (C) 2008,2009 Felipe Fernandes Albrecht (felipe.albrecht@gmail.com)
*
* For further information check the LICENSE file.
*/
package bio.pih.genoogle.statistics;
import java.util.Map;
import bio.pih.genoogle.encoder.SequenceEncoder;
import bio.pih.genoogle.seq.Alphabet;
import bio.pih.genoogle.seq.SymbolList;
import com.google.common.collect.Maps;
/**
* Class to calculate statistics values for similar sequences searching process.
*
* @author albrecht (felipe.albrecht@gmail.com)
**/
/*
*
* This code is a Java implementation from the karlin.c file from the FSA-Blast, which copied from
* blastkar.c and modified to make it work standalone without using implementations of mathematical
* functions in ncbimath.c and other definitions in the NCBI BLAST toolkit.
*
* See: Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical Significance of Molecular
* Sequence Features by Using General Scoring Schemes," Proc. Natl. Acad. Sci. USA 87 (1990),
* 2264-2268.
*
* Computes the parameters lambda and K for use in calculating the statistical significance of
* high-scoring segments or subalignments.
*
* The scoring scheme must be integer valued. A positive score must be possible, but the expected
* (mean) score must be negative.
*
* A program that calls this routine must provide the value of the lowest possible score, the value
* of the greatest possible score, and a pointer to an array of probabilities for the occurence of
* all scores between these two extreme scores. For example, if score -2 occurs with probability
* 0.7, score 0 occurs with probability 0.1, and score 3 occurs with probability 0.2, then the
* subroutine must be called with low = -2, high = 3, and pr pointing to the array of values { 0.7,
* 0.0, 0.1, 0.0, 0.0, 0.2 }. The calling program must also provide pointers to lambda and K; the
* subroutine will then calculate the values of these two parameters. In this example, lambda=0.330
* and K=0.154.
*
* The parameters lambda and K can be used as follows. Suppose we are given a length N random
* sequence of independent letters. Associated with each letter is a score, and the probabilities of
* the letters determine the probability for each score. Let S be the aggregate score of the highest
* scoring contiguous segment of this sequence. Then if N is sufficiently large (greater than 100),
* the following bound on the probability that S is greater than or equal to x applies:
*
* P( S >= x ) <= 1 - exp [ - KN exp ( - lambda * x ) ].
*
* In other words, the p-value for this segment can be written as 1-exp[-KN*exp(-lambda*S)].
*
* This formula can be applied to pairwise sequence comparison by assigning scores to pairs of
* letters (e.g. amino acids), and by replacing N in the formula with N*M, where N and M are the
* lengths of the two sequences being compared.
*
* In addition, letting y = KN*exp(-lambda*S), the p-value for finding m distinct segments all with
* score >= S is given by:
*
* 2 m-1 -y 1 - [ 1 + y + y /2! + ... + y /(m-1)! ] e
*
* Notice that for m=1 this formula reduces to 1-exp(-y), which is the same as the previous formula.
*/
public class MatchDismatchStatistics implements Statistics {
/**
* Calculate the probability of each score and return a {@link Map} with the probabilities.
*
* @param mismatch
* Match score.
* @param match
* Mismatch score.
* @param query
* Input query sequence
* @return {@link Map} which the probability of each score.
*/
private Map<Integer, Double> scoreProbabilities(int mismatch, int match, SymbolList query)
throws IndexOutOfBoundsException {
int[][] baseValue = new int[4][4];
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
if (i == j) {
baseValue[i][j] = match;
} else {
baseValue[i][j] = mismatch;
}
}
}
int min = Math.min(mismatch, match);
int max = Math.max(mismatch, match);
int delta;
if (min < 0) {
delta = Math.abs(min);
} else {
delta = -min;
}
int scoreProbabilitiesSize = (min + delta) + (max + delta) + 1;
double[] scoreProbabilities = new double[scoreProbabilitiesSize];
for (int i = mismatch + delta; i <= match + delta; i++) {
scoreProbabilities[i] = 0.0;
}
int numRegularLettersInQuery = 0;
int length = query.getLength();
for (int i = 1; i <= length; i++) {
if (alphabet.isValid(query.symbolAt(i))) {
numRegularLettersInQuery++;
}
}
for (int i = 1; i <= length; i++) {
double probability = 250.00 / numRegularLettersInQuery;
int querySymbolValue = encoder.getBitsFromChar(query.symbolAt(i));
for (Character c : alphabet.getLetters()) {
int symbolValue = encoder.getBitsFromChar(c);
int score = baseValue[querySymbolValue][symbolValue];
scoreProbabilities[score + delta] += probability;
}
}
final double sum = 1000.00;
for (int i = mismatch + delta; i <= match + delta; i++) {
double probability = scoreProbabilities[i];
scoreProbabilities[i] = probability / sum;
}
Map<Integer, Double> scoreProbabilitiesMap = Maps.newHashMap();
for (int i = mismatch; i <= match; i++) {
scoreProbabilitiesMap.put(i, scoreProbabilities[i + delta]);
}
return scoreProbabilitiesMap;
}
/**
* Calculate the lambda from the scores.
*
* @param prob
* Scores probabilities.
* @param mismatch
* mismatch score.
* @param match
* match score.
* @return Lambda value.
*/
private Double calculateLambda(Map<Integer, Double> prob, int mismatch, int match) {
if (!checkScoreRange(mismatch, match)) {
return null;
}
double lambda0 = BLAST_KARLIN_LAMBDA0_DEFAULT;
for (int j = 0; j < 20; j++) {
double sum = -1.0;
double slope = 0.0;
if (lambda0 < 0.01) {
break;
}
double x0 = Math.exp(lambda0);
double x1 = Math.pow(x0, mismatch - 1);
if (Double.compare(x1, 0.0) == 0) {
break;
}
for (int i = mismatch; i <= match; i++) {
x1 *= x0;
double temp = prob.get(i) * x1;
sum += temp;
slope += temp * i;
}
double amt = sum / slope;
lambda0 -= amt;
if (Math.abs(amt / lambda0) < BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT) {
if (lambda0 > BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT) {
return lambda0;
}
break;
}
}
return blastKarlinLambdaBis(prob, mismatch, match);
}
/**
* Convergent method to find lambda.
*
* @param prob
* Scores probabilities.
* @param min
* minimum score value.
* @param max
* max score value.
* @return lambda value.
*/
private Double blastKarlinLambdaBis(Map<Integer, Double> prob, int min, int max) {
if (!checkScoreRange(min, max)) {
return null;
}
double up = BLAST_KARLIN_LAMBDA0_DEFAULT;
double lambda = 0.0;
for (;;) {
up *= 2;
double x0 = Math.exp(up);
double x1 = Math.pow(x0, min - 1);
double sum = 0.0;
if (x1 > 0.0) {
for (int i = min; i <= max; i++) {
x1 *= x0;
sum += prob.get(i) * x1;
}
} else {
for (int i = min; i <= max; i++) {
sum += prob.get(i) * Math.exp(up * i);
}
}
if (sum >= 1.0) {
break;
}
lambda = up;
}
for (int j = 0; j < BLAST_KARLIN_LAMBDA_ITER_DEFAULT; j++) {
double newval = (lambda + up) / 2.0;
double x0 = Math.exp(newval);
double x1 = Math.pow(x0, min - 1);
double sum = 0.0;
if (x1 > 0.0) {
for (int i = min; i <= max; i++) {
x1 *= x0;
sum += prob.get(i) * x1;
}
} else {
for (int i = min; i <= max; i++) {
sum += prob.get(i) * Math.exp(newval * i);
}
}
if (sum > 1.0) {
up = newval;
} else {
lambda = newval;
}
}
return (lambda + up) / 2.;
}
/**
* Calculate the H value.
*
* @param prob
* Scores probabilities.
*
* @param lambda
* lambda value.
*
* @param mismatch
* mismatch score.
* @param match
* match score.
* @return value of H.
*/
private double blastH(Map<Integer, Double> prob, double lambda, int mismatch, int match) {
if (lambda < 0.0) {
return -1.0;
}
if (!checkScoreRange(mismatch, match)) {
return -1.0;
}
double etolam = Math.exp(lambda);
double etolami = Math.pow(etolam, mismatch - 1);
double av = 0.0;
if (etolami > 0.0) {
for (int score = mismatch; score <= match; score++) {
etolami *= etolam;
av += prob.get(score) * score * etolami;
}
} else {
for (int score = mismatch; score <= match; score++) {
av += prob.get(score) * score * Math.exp(lambda * score);
}
}
return lambda * av;
}
private int DIMOFP0(int iter, int range) {
return iter * range + 1;
}
/**
* Calculate the H value.
*
* @param prob
* Scores probabilities.
* @param lambda
* lambda value.
* @param h
* value of H.
* @param mismatch
* mismatch score.
* @param match
* match score.
* @return value of K.
*/
private double blastK(Map<Integer, Double> prob, double lambda, double h, int mismatch, int match) {
if (lambda <= 0.0 || h <= 0.0) {
return -1.0;
}
double av = h / lambda;
double etolam = Math.exp(lambda);
if (mismatch == -1 || match == 1) {
double K = av;
return K * (1.0 - 1.0 / etolam);
}
int low = mismatch;
int high = match;
int range = high - low;
double sumlimit = BLAST_KARLIN_K_SUMLIMIT_DEFAULT;
int iter = BLAST_KARLIN_K_ITER_MAX;
if (DIMOFP0(iter, range) > DIMOFP0_MAX) {
return -1;
}
double[] P0 = new double[DIMOFP0(iter, range)];
double Sum = 0.0;
int lo = 0;
int hi = 0;
double sum = 1.0;
double oldsum = 1.0;
double oldsum2 = 1.0;
P0[0] = 1.0;
int p = low;
int first;
int last;
long j;
for (j = 0; j < iter & sum > sumlimit; Sum += sum /= ++j) {
first = last = range;
lo += low;
hi += high;
int ptrP;
for (ptrP = hi - lo; ptrP >= 0; P0[ptrP--] = sum) {
int ptr1 = ptrP - first;
int ptr1e = ptrP - last;
int ptr2 = p + first;
for (sum = 0.0; ptr1 >= ptr1e;) {
sum += (P0[ptr1--] * prob.get(ptr2++));
}
if (first != 0) {
--first;
}
if (ptrP <= range) {
--last;
}
}
double etolami = Math.pow(etolam, lo - 1);
int i;
for (sum = 0.0, i = lo; i != 0; ++i) {
etolami *= etolam;
sum += P0[++ptrP] * etolami;
}
for (; i <= hi; ++i) {
sum += P0[++ptrP];
}
oldsum2 = oldsum;
oldsum = sum;
}
/* Terms of geometric progression added for correction */
double ratio = oldsum / oldsum2;
if (ratio >= (1.0 - sumlimit * 0001)) {
return -1;
}
sumlimit *= 0.01;
while (sum > sumlimit) {
oldsum *= ratio;
Sum += sum = oldsum / ++j;
}
int i;
// Look for the greatest common divisor ("delta" in Appendix of PNAS 87 of Karlin&Altschul
// (1990)
for (i = 0, j = -low; i <= range && j > 1; ++i) {
if (prob.get(p + i) != 0) {
j = gcd(j, i);
}
}
if (j * etolam > 0.05) {
double etolami = Math.pow(etolam, -j);
return j * Math.exp(-2.0 * Sum) / (av * (1.0 - etolami));
} else {
return -j * Math.exp(-2.0 * Sum) / (av * Math.exp(-j * lambda) - 1);
}
}
private boolean checkScoreRange(int min, int max) {
if (min < MIN_SCORE || max > MAX_SCORE) {
return false;
}
if (max - min > MAX_SCORE - MIN_SCORE) {
return false;
}
return true;
}
/**
* Adjust the query length.
*
* @param K
* value of the K
* @param LogK
* value of the log of K
* @param H
* value of H
* @param querySize
* the length of the query
*
* @param databaseSize
* the size, total of bases, in the data bank.
* @param numberOfSequences
* quantity of sequences in the data bank.
* @return Length adjusted.
*/
private double lengthAdjust(double K, double LogK, double H, int querySize, long databaseSize,
long numberOfSequences) {
double lenghtAdjust = 0;
double minimumQueryLength = 1 / K;
for (int count = 0; count < 5; count++) {
lenghtAdjust = (LogK + Math.log((querySize - lenghtAdjust)
* (databaseSize - numberOfSequences * lenghtAdjust)))
/ H;
if (lenghtAdjust > querySize - minimumQueryLength) {
lenghtAdjust = querySize - minimumQueryLength;
}
}
return lenghtAdjust;
}
/* (non-Javadoc)
* @see bio.pih.genoogle.statistics.Statistics#nominalToNormalizedScore(double)
*/
@Override
public double nominalToNormalizedScore(double nominalScore) {
return ((nominalScore * this.lambda) - this.logK) / LOG_2;
}
/* (non-Javadoc)
* @see bio.pih.genoogle.statistics.Statistics#calculateEvalue(double)
*/
@Override
public double calculateEvalue(double normalizedScore) {
return this.searchSpaceSize / Math.pow(2, normalizedScore);
}
/* (non-Javadoc)
* @see bio.pih.genoogle.statistics.Statistics#gappedEvalueToNominal(double)
*/
@Override
public double gappedEvalueToNominal(double evalue) {
double normalizedScore = Math.log(this.searchSpaceSize / evalue) / LOG_2;
return Math.ceil((LOG_2 * normalizedScore + this.logK) / lambda);
}
/* (non-Javadoc)
* @see bio.pih.genoogle.statistics.Statistics#gcd(long, long)
*/
@Override
public long gcd(long a, long b) {
long c;
b = Math.abs(b);
if (b > a) {
c = a;
a = b;
b = c;
}
while (b != 0) {
c = a % b;
a = b;
b = c;
}
return a;
}
private final Map<Integer, Double> probabilities;
private final double lambda;
private final double H;
private final double K;
private final double logK;
private final double effectiveQuerySize;
private final double effectiveDatabaseSize;
private final double searchSpaceSize;
private final double lengthAdjust;
private final Alphabet alphabet;
private final SequenceEncoder encoder;
/**
* Create the statistics values from the giver query sequence, the data bank size and number of
* sequences in this data bank.
*
* @param match
* Scoring to a match.
* @param mismatch
* Scoring to a mismatch.
* @param query
* Input query sequence.
* @param dataBankSize
* quantity of bases in the all data bank sequences.
* @param numberOfSequences
* quantity of sequences in the data bank.
*/
public MatchDismatchStatistics(Alphabet alphabet, SequenceEncoder encoder, int match, int mismatch, SymbolList query, long dataBankSize,
long numberOfSequences) throws IndexOutOfBoundsException {
this.alphabet = alphabet;
this.encoder = encoder;
this.probabilities = scoreProbabilities(mismatch, match, query);
this.lambda = calculateLambda(probabilities, mismatch, match);
this.H = blastH(probabilities, lambda, mismatch, match);
this.K = blastK(probabilities, lambda, H, mismatch, match);
this.logK = Math.log(K);
this.lengthAdjust = lengthAdjust(K, logK, H, query.getLength(), dataBankSize, numberOfSequences);
this.effectiveQuerySize = query.getLength() - lengthAdjust;
this.effectiveDatabaseSize = dataBankSize - numberOfSequences * lengthAdjust;
this.searchSpaceSize = effectiveQuerySize * effectiveDatabaseSize;
}
public void print_values(int match) {
System.out.println((int) Math.floor(gappedEvalueToNominal(100) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(100)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(10) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(10)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(1) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(1)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.1) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.1)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.001) / match));
System.out.println(0.001);
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.00001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.00001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.0000001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.0000001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.0000000001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.0000000001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.000000000001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.000000000001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.000000000000001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.000000000000001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.00000000000000001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.00000000000000001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.0000000000000000001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.0000000000000000001)))));
System.out.println((int) Math.floor(gappedEvalueToNominal(0.0000000000000000000001) / match));
System.out.println(calculateEvalue(nominalToNormalizedScore(Math.floor(gappedEvalueToNominal(0.0000000000000000000001)))));
System.out.println("lambda: " + lambda);
System.out.println("H: " + H);
System.out.println("K: " + K);
System.out.println("Length Adjust: " + lengthAdjust);
System.out.println("Effective query size: " + effectiveQuerySize);
System.out.println("Effective database size: " + effectiveDatabaseSize);
System.out.println("Search space size: " + searchSpaceSize);
}
}