package org.seqcode.data.motifdb; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import org.seqcode.genome.Genome; import org.seqcode.gseutils.NotFoundException; import org.seqcode.gseutils.Pair; import org.seqcode.math.stats.Fmath; /** * @author mahony * @author rca This class represents a background model where the values are * frequencies. For example the probabilities of all 16 possible * dinucleotides sum to 1. */ public class FrequencyBackgroundModel extends BackgroundModel implements BackgroundModelFrequencySupport { /** * Construct a new FrequencyBackgroundModel from the supplied metadata object * @param md the metadata describing this model * @throws NotFoundException if a Genome can't be found for the metadata's * genome ID */ public FrequencyBackgroundModel(BackgroundModelMetadata md) throws NotFoundException { super(md); if (!BackgroundModelLoader.FREQUENCY_TYPE_STRING.equals(md.getDBModelType())) { throw new IllegalArgumentException("Metadata model type must be FREQUENCY"); } } /** * Construct a new FrequencyBackground model with the specified name, for the * specified genome, and with a default max kmer length * @param name * @param gen */ public FrequencyBackgroundModel(String name, Genome gen) { super(name, gen, BackgroundModelLoader.FREQUENCY_TYPE_STRING); } /** * Construct a new FrequencyBackground model with the specified name, for the * specified genome, and with the specified max kmer length * @param name * @param gen * @param maxKmerLen */ public FrequencyBackgroundModel(String name, Genome gen, int maxKmerLen) { super(name, gen, maxKmerLen, BackgroundModelLoader.FREQUENCY_TYPE_STRING); } /** * Construct a FrequencyBackgroundModel from an existing CountsBackgroundModel * @param cbg the existing CountsBackgroundModel */ public FrequencyBackgroundModel(CountsBackgroundModel cbg) { super(cbg, BackgroundModelLoader.FREQUENCY_TYPE_STRING); for (int i = 1; i <= cbg.getMaxKmerLen(); i++) { cbg.computeFrequencies(i); modelProbs[i].putAll(cbg.modelProbs[i]); } } /** * set the hasCounts fields of the metadata */ protected void init() { this.hasCounts = false; } /** * @see BackgroundModel */ public Set<String> getKmers(int kmerLen) { return modelProbs[kmerLen].keySet(); } /** * @see BackgroundModel * On the fly compute the markov probability from the known frequencies */ public double getMarkovProb(String kmer) { String prevBases = kmer.substring(0, kmer.length() - 1); double total = 0.0; for (char currBase : BackgroundModel.BASE_ORDER) { String currKmer = prevBases + currBase; total = total + this.getFrequency(currKmer); } if (total == 0.0) { return 0.0; } else { return (this.getFrequency(kmer) / total); } } /** * @see BackgroundModelFrequencySupport */ public double getFrequency(String kmer) { if (modelProbs[kmer.length()].containsKey(kmer)) { return (modelProbs[kmer.length()].get(kmer)); } else { return 0.0; } } /** * @see BackgroundModelFrequencySupport */ public double getFrequency(int intVal, int kmerLen) { return this.getFrequency(BackgroundModel.int2seq(intVal, kmerLen)); } /** * Set the model to have the frequency probabilities specified in the map * @param probs A map of kmers of identical length and their probabilities, * which must sum to 1. */ public void setKmerFrequencies(Map<String, Double> probs) { //check that the specified probabilties sum to 1 and are for kmers of the //correct length double total = 0.0; int kmerLen = probs.keySet().iterator().next().length(); if (kmerLen < 1) { throw new IllegalArgumentException("Kmer length must be >= 1."); } for (String kmer : probs.keySet()) { if (BackgroundModel.isKmerValid(kmer)) { if (kmer.length() == kmerLen) { total += probs.get(kmer); } else { throw new IllegalArgumentException("Kmers in map must all have the same length"); } } else { throw new IllegalArgumentException("Kmers must consist of one or more DNA bases, but is: " + kmer); } } //set the model to use the specified probabilities if (Fmath.isEqualWithinLimits(total, 1.0, BackgroundModel.EPSILON)) { modelProbs[kmerLen] = new HashMap<String, Double>(probs); } else { throw new IllegalArgumentException("Probabilities must sum to 1, but instead sum to " + total); } //reset the isStranded variable to null to indicate unknown strandedness isStranded = null; } /** * @see BackgroundModel * Check whether the reverse complement kmers have equal frequencies for all * the kmer lengths */ public boolean checkAndSetIsStranded() { int currKmerLen = 1; while (currKmerLen <= modelProbs.length) { List<Pair<Integer, Integer>> revCompPairs = BackgroundModel.computeDistinctRevCompPairs(currKmerLen); for (Pair<Integer, Integer> rcPair : revCompPairs) { if (!Fmath.isEqualWithinLimits(this.getFrequency(currKmerLen, rcPair.car()), this.getFrequency(currKmerLen,rcPair.cdr()), BackgroundModel.EPSILON)) { isStranded = true; return isStranded; } } } /** * If all kmerlens have been checked and the method hasn't returned then the * model is not stranded */ isStranded = false; return isStranded; } /** * @see BackgroundModelFrequencySupport * For a frequency based model this is accomplished by setting reverse * complements to have the average of their frequencies, so there is no harm * in running this method even if the model is already destranded. */ public void degenerateStrands() { for (int currKmerLen = 1; currKmerLen <= this.getMaxKmerLen(); currKmerLen++) { List<Pair<Integer, Integer>> revCompPairs = BackgroundModel.computeDistinctRevCompPairs(currKmerLen); for (Pair<Integer, Integer> rcPair : revCompPairs) { double freq = this.getFrequency(rcPair.car(), currKmerLen); double revCompFreq = this.getFrequency(rcPair.cdr(), currKmerLen); double newFreq = (freq + revCompFreq) / 2.0; modelProbs[currKmerLen].put(BackgroundModel.int2seq(rcPair.car(), currKmerLen), newFreq); modelProbs[currKmerLen].put(BackgroundModel.int2seq(rcPair.cdr(), currKmerLen), newFreq); } } isStranded = false; } /** * Check if this model is normalized properly. * @return the shortest kmerlen for which the model is not normalized, or -1 * if normalized properly. */ public int verifyNormalization() { //iterate over each order level of the model for (int i = 1; i <= this.getMaxKmerLen(); i++) { Double total = 0.0; //iterate over all the n-mers of the order summing up the values for (int k = 0; k < (int) Math.pow(4, i); k++) { String currMer = int2seq(k, i); total += modelProbs[i].get(currMer); } if (!Fmath.isEqualWithinLimits(total, 1.0, 1E-6)) { return i; } } return -1; } }