package org.seqcode.data.motifdb; import java.util.List; import java.util.Set; import org.seqcode.genome.Genome; import org.seqcode.gseutils.NotFoundException; import org.seqcode.gseutils.Pair; import org.seqcode.math.stats.Fmath; import cern.colt.matrix.*; import cern.colt.matrix.impl.*; import cern.colt.matrix.linalg.Algebra; import cern.jet.math.Functions; /** * @author mahony * @author rca This class represents a background model where the values are * conditional probabilities. For dinucleotides, for example, p(A|A) + * p(C|A) + p(G|A) + p(T|A) = 1. */ public class MarkovBackgroundModel extends BackgroundModel { /** * Construct a new MarkovBackgroundModel from the supplied metadata object * @param md the metadata describing this model * @throws NotFoundException if a Genome can't be found for the metadata's * genome ID */ public MarkovBackgroundModel(BackgroundModelMetadata md) throws NotFoundException { super(md); if (!BackgroundModelLoader.MARKOV_TYPE_STRING.equals(md.getDBModelType())) { throw new IllegalArgumentException("Metadata model type must be MARKOV"); } } /** * Construct a new MarkovBackground model with the specified name, for the * specified genome, and with a default max kmer length * @param name * @param gen */ public MarkovBackgroundModel(String name, Genome gen) { super(name, gen, BackgroundModelLoader.MARKOV_TYPE_STRING); } /** * Construct a new MarkovBackground model with the specified name, for the * specified genome, and with the specified max kmer length * @param name * @param gen * @param maxKmerLen the max Kmer Length for the model */ public MarkovBackgroundModel(String name, Genome gen, int maxKmerLen) { super(name, gen, maxKmerLen, BackgroundModelLoader.MARKOV_TYPE_STRING); } /** * Construct a Markov Background Model from an existing Frequency Background * Model. * @param fbg the existing frequency background model */ public MarkovBackgroundModel(FrequencyBackgroundModel fbg) { super(fbg, BackgroundModelLoader.MARKOV_TYPE_STRING); //iterate over each order level of the model for (int i = 1; i <= this.getMaxKmerLen(); i++) { //iterate over all sets of conditions for that order for (int k = 0; k < (int) Math.pow(4, i); k += 4) { Double total = 0.0; //iterate over the 4 outcomes for the conditional probability //summing up the values for (int b = 0; b < 4; b++) { String currKmer = int2seq(k + b, i); total += fbg.getFrequency(currKmer); } //if the total is 0 skip these kmers and keep the model sparse if (total > 0) { //iterate over the 4 outcomes normalizing for (int b = 0; b < 4; b++) { String currKmer = int2seq(k + b, i); modelProbs[i].put(currKmer, fbg.getFrequency(currKmer) / total); } } } } } /** * set the hasCounts fields of the metadata */ protected void init() { this.hasCounts = false; } /** * @see BackgroundModel */ public Set<String> getKmers(int kmerLen) { return modelProbs[kmerLen].keySet(); } /** * Construct a Markov Background Model from an existing Counts Background * Model. * @param cbg */ public MarkovBackgroundModel(CountsBackgroundModel cbg) { this(new FrequencyBackgroundModel(cbg)); } /** * Return the markov probability for the last base of the specified kmer * conditioned upon the preceding bases. */ public double getMarkovProb(String kmer) { if (modelProbs[kmer.length()].containsKey(kmer)) { return modelProbs[kmer.length()].get(kmer); } else { return 0.0; } } /** * Sets the 4 markov probabilities that are conditioned on the specified * string of previous bases. * @param prevBases The bases that the probabilities are conditionally * dependent upon * @param aProb P( A | prevBases) * @param cProb P( C | prevBases) * @param gProb P( G | prevBases) * @param tProb P( T | prevBases) */ public void setMarkovProb(String prevBases, double aProb, double cProb, double gProb, double tProb) { prevBases = prevBases.toUpperCase(); if (BackgroundModel.isKmerValid(prevBases)) { double total = aProb + cProb + gProb + tProb; //if ((aProb >= 0) && (cProb >= 0) && (gProb >= 0) && (tProb >= 0) // && (Fmath.isEqualWithinLimits(total, 1.0, BackgroundModel.EPSILON) // || (total == 0))) { int kmerLen = prevBases.length() + 1; modelProbs[kmerLen].put(prevBases + "A", aProb/total); modelProbs[kmerLen].put(prevBases + "C", cProb/total); modelProbs[kmerLen].put(prevBases + "G", gProb/total); modelProbs[kmerLen].put(prevBases + "T", tProb/total); //} //else { // throw new IllegalArgumentException("Probabilities must sum to 1 or must all be 0, but instead sum to " + total + " for prevBases " + prevBases); //} //reset the isStranded variable to null to indicate unknown strandedness isStranded = null; } else { throw new IllegalArgumentException("Previous Bases must consist of zero or more DNA bases, but is: " + prevBases); } } /** * Check if this model is normalized properly. * @return an array of string containing the kmers for which the model isn't * normalized, or null if it is normalized correctly */ public String[] verifyNormalization() { //iterate over each order level of the model for (int i = 1; i <= this.getMaxKmerLen(); i++) { //iterate over all sets of conditions for that order for (int k = 0; k < (int) Math.pow(4, i); k += 4) { Double total = 0.0; //iterate over the 4 outcomes for the conditional probability //summing up the values String[] currMers = new String[4]; for (int b = 0; b < 4; b++) { String currMer = int2seq(k + b, i); currMers[b] = currMer; total += this.getMarkovProb(currMer); } if (!Fmath.isEqualWithinLimits(total, 1.0, 1E-6)) { return currMers; } } } return null; } /** * Checks whether this model is based on a single strand of sequence or * double-stranded sequence. For the Markov model a linear system can be * created from the log-space version of the equations for normalizing the * model and equations indicating the equality of reverse complement kmers. * The system will be underconstrained, so there will always be a solution, * but once converted back to linear-space additional constraints can be * verified to determine whether the solution is valid. */ public boolean checkAndSetIsStranded() { int currKmerLen = 1; while (currKmerLen <= modelProbs.length) { //set up and solve the log-space linear system for the current kmers List<Pair<Integer, Integer>> revCompPairs = BackgroundModel.computeDistinctRevCompPairs(currKmerLen); int numCurrKmers = (int)Math.pow(4, currKmerLen); DoubleMatrix2D linSys = MarkovBackgroundModel.createStrandednessLinearSystem(numCurrKmers, revCompPairs); DoubleMatrix1D logMarkov = new DenseDoubleMatrix1D(numCurrKmers); for (int i = 0; i < logMarkov.size(); i++) { Double prob = this.getMarkovProb(i, currKmerLen); if (prob != null) { logMarkov.setQuick(i, Math.exp(prob)); } else { logMarkov.setQuick(i, -Double.MAX_VALUE); } } Algebra algebra = new Algebra(); DoubleMatrix1D sol = algebra.mult(algebra.inverse(linSys), logMarkov).assign(Functions.exp); //now check that the rev. comp components of the solution are matched for (Pair<Integer, Integer> rcPair : revCompPairs) { if (!Fmath.isEqualWithinLimits(sol.getQuick(rcPair.car()), sol.getQuick(rcPair.cdr()), BackgroundModel.EPSILON)) { isStranded = true; return isStranded; } } //finally, check that the sums used to normalize match correctly for (int i = 0; i < numCurrKmers; i+= 4) { double sum = sol.viewPart(i, 4).zSum(); int solSumIndex = numCurrKmers + (i / 4); if (!Fmath.isEqualWithinLimits(sum, sol.getQuick(solSumIndex), BackgroundModel.EPSILON)) { isStranded = true; return isStranded; } } currKmerLen++; } /** * If all kmerlens have been checked and the method hasn't returned then the * model is not stranded */ isStranded = false; return isStranded; } /** * Create a linear system of the log-space versions of the equations used to * normalize a frequency/count model as a markov model. * @param numKmers * @param revCompPairs * @return */ private static DoubleMatrix2D createStrandednessLinearSystem(int numKmers, List<Pair<Integer, Integer>> revCompPairs) { int numSeqGroups = numKmers / 4; DoubleMatrix2D linSys = new SparseDoubleMatrix2D(numKmers + revCompPairs.size(), (numKmers + numSeqGroups)); for (int i = 0; i < numKmers; i++) { //set the kmer's entry for itself linSys.setQuick(i, i, 1); //set the kmer's entry for the kmer group sum int groupIndex = numKmers + (i / 4); linSys.setQuick(i, groupIndex, -1); } for (int i = 0; i < revCompPairs.size(); i++) { Pair<Integer, Integer> revComps = revCompPairs.get(i); linSys.setQuick(numKmers + i, revComps.car(), 1); linSys.setQuick(numKmers + i, revComps.cdr(), -1); } return linSys; } }