/** * */ package org.seqcode.data.motifdb; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import org.seqcode.data.io.parsing.FASTAStream; import org.seqcode.genome.Genome; import org.seqcode.genome.location.NamedRegion; import org.seqcode.genome.location.Region; import org.seqcode.genome.sequence.SequenceGenerator; import org.seqcode.genome.sequence.SequenceUtils; import org.seqcode.gsebricks.verbs.location.ChromRegionIterator; import org.seqcode.gseutils.NotFoundException; import org.seqcode.gseutils.Pair; /** * @author rca * */ public class CountsBackgroundModel extends BackgroundModel implements BackgroundModelFrequencySupport { /** * Map for holding the kmer counts for the model. Each * element of the array holds kmers whose length is the index of that element. * i.e. model[2] holds 2-mers. Accordingly, model[0] should always be empty. * * Note: This may be null if count information is unavailable (as may be the * case for a frequency model or markov model) */ private Map<String, Long>[] kmerCounts; /** * Construct a new CountsBackgroundModel from the supplied metadata object * @param md the metadata describing this model * @throws NotFoundException if a Genome can't be found for the metadata's * genome ID */ public CountsBackgroundModel(BackgroundModelMetadata md) throws NotFoundException { super(md); if (!md.hasDBModelType()) { this.dbModelType = BackgroundModelLoader.FREQUENCY_TYPE_STRING; } } /** * Construct a new CountsBackground model with the specified name, for the * specified genome, and with a default max kmer length * @param name * @param gen */ public CountsBackgroundModel(String name, Genome gen) { super(name, gen, BackgroundModelLoader.FREQUENCY_TYPE_STRING); } /** * Construct a new CountsBackground model with the specified name, for the * specified genome, and with the specified max kmer length * @param name * @param gen * @param maxKmerLen */ public CountsBackgroundModel(String name, Genome gen, int maxKmerLen) { //Note: typically Count based models will go into the DB as frequency models //so use frequency as a model type super(name, gen, maxKmerLen, BackgroundModelLoader.FREQUENCY_TYPE_STRING); } /** * Initialize the kmer counts hash and set the hasCounts field of the metadata */ protected void init() { this.hasCounts = true; kmerCounts = new HashMap[maxKmerLen + 1]; for (int i = 1; i <= maxKmerLen; i++) { kmerCounts[i] = new HashMap<String, Long>(); } //set the model map array to be empty so that it can be used to lazily //shadow the counts with frequencies Arrays.fill(modelProbs, null); } /** * @see BackgroundModel */ public Set<String> getKmers(int kmerLen) { return kmerCounts[kmerLen].keySet(); } /** * @see BackgroundModel * On the fly compute the markov probability from the known frequencies */ public double getMarkovProb(String kmer) { String prevBases = kmer.substring(0, kmer.length() - 1); double total = 0.0; for (char currBase : BackgroundModel.BASE_ORDER) { String currKmer = prevBases + currBase; total = total + this.getFrequency(currKmer); } if (total == 0.0) { return 0.0; } else { return (this.getFrequency(kmer) / total); } } /** * @see BackgroundModelFrequencySupport */ public double getFrequency(String kmer) { //build the cache if it's not ready if (modelProbs[kmer.length()] == null) { this.computeFrequencies(kmer.length()); } if (modelProbs[kmer.length()].containsKey(kmer)) { return (modelProbs[kmer.length()].get(kmer)); } else { return 0.0; } } /** * @see BackgroundModelFrequencySupport */ public double getFrequency(int intVal, int kmerLen) { return this.getFrequency(BackgroundModel.int2seq(intVal, kmerLen)); } /** * * @param mer * @return */ public long getKmerCount(String kmer) { if (kmerCounts[kmer.length()].containsKey(kmer)) { return (kmerCounts[kmer.length()].get(kmer)); } else { return 0; } } /** * * @param kmerLen * @param intVal * @return */ public long getKmerCount(int intVal, int kmerLen) { return (this.getKmerCount(BackgroundModel.int2seq(intVal, kmerLen))); } public void setKmerCount(String kmer, long count) { if (BackgroundModel.isKmerValid(kmer)) { kmerCounts[kmer.length()].put(kmer, count); //clear the frequency map for the kmer's length modelProbs[kmer.length()] = null; //reset the isStranded variable to null to indicate unknown strandedness isStranded = null; } else { throw new IllegalArgumentException("Kmers must consist of one or more DNA bases, but is: " + kmer); } } public void addToKmerCount(String kmer, long addlCount) { kmerCounts[kmer.length()].put(kmer, this.getKmerCount(kmer) + addlCount); //clear the frequency map for the kmer's length modelProbs[kmer.length()] = null; //reset the isStranded variable to null to indicate unknown strandedness isStranded = null; } /** * @see addKmerCountsFromSequence(String sequence, boolean addRevComp) * @param sequence */ public void addKmerCountsFromSequence(String sequence) { this.addKmerCountsFromSequence(sequence, false); } /** * Examine all the appropriately sized kmers from the specified sequence and * add them to this model * @param sequence * @param addRevComp if true also add counts for the reverse complement */ public void addKmerCountsFromSequence(String sequence, boolean addRevComp) { //handle all the positions with complete length kmers String currSub = ""; int maxLen = this.getMaxKmerLen(); for (int i = 0; i <= sequence.length() - maxLen; i++) { currSub = sequence.substring(i, i + maxLen); for (int k = 1; k <= maxLen; k++) { String currKmer = currSub.substring(0, k); if (!currKmer.contains(" ") && !currKmer.contains("N")) { this.addToKmerCount(currKmer, 1); if (addRevComp) { this.addToKmerCount(SequenceUtils.reverseComplement(currKmer), 1); } } } } //Put in the last few kmers int start = sequence.length() - maxLen + 1; for (int i = start; i < sequence.length(); i++) { if(sequence.length()>1){ currSub = sequence.substring(i); for (int k = 1; k <= currSub.length(); k++) { String currKmer = currSub.substring(0, k); if (!currKmer.contains(" ") && !currKmer.contains("N")) { this.addToKmerCount(currKmer, 1); if (addRevComp) { this.addToKmerCount(SequenceUtils.reverseComplement(currKmer), 1); } } } } } isStranded = !addRevComp; //set the model map array to be empty so that it can be used to lazily //shadow the counts with frequencies Arrays.fill(modelProbs, null); } /** * @see BackgroundModel * Check whether the reverse complement kmers have equal counts for all * the kmer lengths */ public boolean checkAndSetIsStranded() { int currKmerLen = 1; while (currKmerLen <= modelProbs.length) { List<Pair<Integer, Integer>> revCompPairs = BackgroundModel.computeDistinctRevCompPairs(currKmerLen); for (Pair<Integer, Integer> rcPair : revCompPairs) { if (this.getKmerCount(rcPair.car(), currKmerLen) == this.getKmerCount(rcPair.cdr(), currKmerLen)) { isStranded = true; return isStranded; } } } /** * If all kmerlens have been checked and the method hasn't returned then the * model is not stranded */ isStranded = false; return isStranded; } /** * @see BackgroundModelFrequencySupport * For a count based model this is accomplished by setting reverse * complements to have the sum of their counts. */ public void degenerateStrands() { if (isStranded == null) { this.checkAndSetIsStranded(); } //only destrand the model if its stranded, otherwise the counts will blow up if (isStranded) { for (int currKmerLen = 1; currKmerLen <= this.getMaxKmerLen(); currKmerLen++) { List<Pair<Integer, Integer>> revCompPairs = BackgroundModel.computeDistinctRevCompPairs(currKmerLen); for (Pair<Integer, Integer> rcPair : revCompPairs) { long count = this.getKmerCount(rcPair.car(), currKmerLen);; long revCompCount = this.getKmerCount(rcPair.cdr(), currKmerLen);; long newCount = count + revCompCount; this.setKmerCount(BackgroundModel.int2seq(rcPair.car(), currKmerLen), newCount); this.setKmerCount(BackgroundModel.int2seq(rcPair.cdr(), currKmerLen), newCount); } } } //set the model map array to be empty so that it can be used to lazily //shadow the counts with frequencies Arrays.fill(modelProbs, null); } /** * Compute the frequencies corresponding to the counts for kmers of the * specified length * @param kmerLen */ protected void computeFrequencies(int kmerLen) { double total = 0; for (long count : kmerCounts[kmerLen].values()) { total += count; } modelProbs[kmerLen] = new HashMap<String, Double>(); if (total > 0) { for (String kmer : kmerCounts[kmerLen].keySet()) { modelProbs[kmerLen].put(kmer, ((double)this.getKmerCount(kmer) / total)); } } } /** * Create a model from the entirety of the specified genome * @param gen * @return */ public static CountsBackgroundModel modelFromWholeGenome(Genome gen, int k){ ArrayList <Region>chromList = new ArrayList<Region>(); Iterator<NamedRegion> chroms = new ChromRegionIterator(gen); while (chroms.hasNext()) { chromList.add(chroms.next()); } return CountsBackgroundModel.modelFromRegionList(gen, chromList, k); } public static CountsBackgroundModel modelFromWholeGenome(Genome gen){ return modelFromWholeGenome(gen, DEFAULT_MAX_KMER_LEN);} /** * Create a model from a list of regions from the specified genome * @param gen * @param regionList * @return */ public static CountsBackgroundModel modelFromRegionList(Genome gen, List<Region> regionList, int k){ SequenceGenerator<Region> seqgen = new SequenceGenerator<Region>(); seqgen.useCache(false); CountsBackgroundModel cbg = new CountsBackgroundModel(null, gen, k); cbg.gen = gen; for (Region currR : regionList) { String tmpSeq = seqgen.execute(currR); String regionSeq = tmpSeq.toUpperCase(); cbg.addKmerCountsFromSequence(regionSeq); } return cbg; } public static CountsBackgroundModel modelFromRegionList(Genome gen, List<Region> regionList) { return modelFromRegionList(gen, regionList, DEFAULT_MAX_KMER_LEN);} /** * Create a model from a list of sequences * @param gen * @param regionList * @return */ public static CountsBackgroundModel modelFromSeqList(Genome gen, List<String> seqList, int k) { CountsBackgroundModel cbg = new CountsBackgroundModel(null, gen, k); for (String curr : seqList) { String regionSeq = curr.toUpperCase(); cbg.addKmerCountsFromSequence(regionSeq); } return cbg; } public static CountsBackgroundModel modelFromSeqList(Genome gen, List<String> seqList) {return modelFromSeqList(gen, seqList,DEFAULT_MAX_KMER_LEN);} /** * Create a model from a FASTAStream object * @param stream * @return */ public static CountsBackgroundModel modelFromFASTAStream(FASTAStream stream, int k) { CountsBackgroundModel cbg = new CountsBackgroundModel(null, null, k); while (stream.hasNext()) { Pair<String, String> currSeq = stream.next(); String tmpSeq = currSeq.cdr(); String regionSeq = tmpSeq.toUpperCase(); cbg.addKmerCountsFromSequence(regionSeq); } stream.close(); return cbg; }public static CountsBackgroundModel modelFromFASTAStream(FASTAStream stream) { return modelFromFASTAStream(stream, DEFAULT_MAX_KMER_LEN);} }