package org.seqcode.data.motifdb;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.seqcode.genome.Genome;
import org.seqcode.genome.sequence.SequenceUtils;
import org.seqcode.gseutils.NotFoundException;
import org.seqcode.gseutils.Pair;
import org.seqcode.math.stats.Fmath;
public abstract class BackgroundModel extends BackgroundModelMetadata {
public static final Pattern KMER_PATTERN = Pattern.compile("[ACGT]*");
public static final double EPSILON = 1E-6;
public static final char[] BASE_ORDER = new char[] { 'A', 'C', 'G', 'T' };
public static final int DEFAULT_MAX_KMER_LEN = 3;
// a reference to a genome object in addition to the metadata genomeID
protected Genome gen;
/**
* For keeping track of whether or not both strands are accounted for in the
* model. True if the model was based on a single strand False if the model
* was based on both strands
*/
protected Boolean isStranded = null;
/**
* Map for holding the kmer probability values for the model. Each element of
* the array holds kmers whose length is the index of that element. i.e.
* model[2] holds 2-mers. Accordingly, model[0] should always be empty.
*
* Note: this should only be used for probabilties, and not for counts. The
* CountBasedModel uses a separate Map to hold the counts and this map to keep
* the normalized counts (frequencies).
*/
protected Map<String, Double>[] modelProbs;
/**
* Construct a background model from the specified metadata object
* @param md the metadata describing this model
* @throws NotFoundException if a Genome can't be found for the metadata's
* genome ID
*/
public BackgroundModel(BackgroundModelMetadata md) throws NotFoundException {
super(md);
this.gen = Genome.findGenome(this.genomeID);
modelProbs = new HashMap[maxKmerLen + 1];
for (int i = 1; i <= maxKmerLen; i++) {
modelProbs[i] = new HashMap<String, Double>();
}
this.init();
}
/**
* Construct a model with the specified name, genome, and model type, and a
* default max kmer len
* @param name
* @param gen
* @param modelType
*/
public BackgroundModel(String name, Genome gen, String modelType) {
this(name, gen, DEFAULT_MAX_KMER_LEN, modelType);
}
/**
* Construct a model with the specified name, genome, and model type, and the
* specified max kmer len
* @param maxKmerLen
*/
public BackgroundModel(String name, Genome gen, int maxKmerLen, String modelType) {
super(gen==null ? 0 : gen.getDBID(), name, maxKmerLen, modelType);
this.gen = gen;
modelProbs = new HashMap[maxKmerLen + 1];
for (int i = 1; i <= maxKmerLen; i++) {
modelProbs[i] = new HashMap<String, Double>();
}
this.init();
}
/**
* Construct a Background Model from an existing Background Model. Set the
* instance variables of this class to match the source model. Subclasses
* should only set the dbid if the type of the subclass model matches the type
* of the source model.
*
* @param source
* the Background Model on which to base the one being constructed
*/
public BackgroundModel(BackgroundModel source, String dbModelType) {
this(source.name, source.gen, source.getMaxKmerLen(), dbModelType);
//Check this class's type and set the dbids if they match
if (this.getClass() == source.getClass()) {
this.mapID = source.mapID;
this.modelID = source.modelID;
}
this.isStranded = source.isStranded;
this.init();
}
/**
* A subclass specific init method to be called at the end of the constructor
* to handle tasks common among the constructors
*/
protected abstract void init();
/**
* overrides setHasCounts in BackgroundModelMetadata. It's not possible to set
* whether an instance of a background model has counts - it's implicit based
* on the subclass type.
*/
public void setHasCounts(boolean hasCounts) {
throw new UnsupportedOperationException("Can't modify hasCounts on Background Model instances");
}
/**
* Set the name of this model
*
* @param name
*/
public void setName(String name) {
if (name == null) {
throw new NullPointerException("Name can not be set to null");
}
else {
this.name = name;
}
}
/**
* Returns this model's genome
*
* @return the genome, which is null if it's not set
*/
public Genome getGenome() {
return gen;
}
/**
* Set this model to have the specified genome
*
* @param gen
*/
public void setGenome(Genome gen) {
this.gen = gen;
}
/**
* Get the Markov-order of this model
*
* @return
*/
public int getMarkovOrder() {
return modelProbs.length - 2;
}
/**
* Returns true if this model is Stranded (i.e. based on a single strand) or
* false if it is based on both strands
* @return
*/
public boolean isStranded() {
if (isStranded == null) {
this.checkAndSetIsStranded();
}
return isStranded;
}
/**
* Check whether this model is making use of both strands or just a single
* strand (i.e. check if all probabilities/counts match their reverse
* complement
*
* @return
*/
public abstract boolean checkAndSetIsStranded();
/**
* Return the set of kmers of the specified length for which this model has
* probabilities or counts
*
* @param kmerLen
* @return
*/
public abstract Set<String> getKmers(int kmerLen);
/**
* Return the markov probability for the last base of the specified kmer
* conditioned upon the preceding bases.
*
* @param kmer
* @return
*/
public abstract double getMarkovProb(String kmer);
/**
* Return the markov probability for the last base of the kmer represented by
* the specified int, conditioned upon the preceding bases.
*
* @param intVal
* @param kmerLen
* @return
*/
public double getMarkovProb(int intVal, int kmerLen) {
return (this.getMarkovProb(BackgroundModel.int2seq(intVal, kmerLen)));
}
/**
* Check if this background model is equal to the other background model in
* terms of the kmer values. The class must be the same so they can be compared
* properly.
* @param other the model to check
* @return true if the values are the same
*/
public boolean equalValues(BackgroundModel other) {
if (this.getClass().equals(other.getClass())) {
int maxKmerLen = Math.max(this.maxKmerLen, other.getMaxKmerLen());
// check kmers values from longest to shortest
for (int k = maxKmerLen; k > 0; k--) {
// assemble a set of all kmers in either model
HashSet<String> kmers = new HashSet<String>();
if (k <= this.maxKmerLen) {
kmers.addAll(this.getKmers(k));
}
if (k <= other.getMaxKmerLen()) {
kmers.addAll(other.getKmers(k));
}
// check all the kmers values as appropriate for the type of model
for (String kmer : kmers) {
if (this instanceof CountsBackgroundModel) {
long thisCount = 0;
long otherCount = 0;
if (k <= this.maxKmerLen) {
thisCount = ((CountsBackgroundModel) this).getKmerCount(kmer);
}
if (k <= other.getMaxKmerLen()) {
otherCount = ((CountsBackgroundModel) this).getKmerCount(kmer);
}
if (thisCount != otherCount) {
return false;
}
}
else {
double thisProb = 0;
double otherProb = 0;
if (this instanceof MarkovBackgroundModel) {
if (k <= this.maxKmerLen) {
thisProb = this.getMarkovProb(kmer);
}
if (k <= other.getMaxKmerLen()) {
otherProb = other.getMarkovProb(kmer);
}
}
else {
// check frequencies
if (k <= this.maxKmerLen) {
thisProb = ((FrequencyBackgroundModel) this).getFrequency(kmer);
}
if (k <= other.getMaxKmerLen()) {
otherProb = ((FrequencyBackgroundModel) other).getFrequency(kmer);
}
}
if (!Fmath.isEqualWithinLimits(thisProb, otherProb, EPSILON)) {
return false;
}
}
}
}
// if kmers values are equal for all lengths return true
return true;
}
else {
return false;
}
}
/**
* Convert a base to an int value
*
* @param base
* @return
*/
public static int base2int(char base) {
int intVal = -1;
switch (base) {
case 'A':
intVal = 0;
break;
case 'C':
intVal = 1;
break;
case 'G':
intVal = 2;
break;
case 'T':
intVal = 3;
break;
default:
throw new IllegalArgumentException("Invalid character: " + base);
}
return intVal;
}
/**
* Return a base for the specified integer
*
* @param x
* @return
*/
public static char int2base(int x) {
char base;
switch (x) {
case 0:
base = 'A';
break;
case 1:
base = 'C';
break;
case 2:
base = 'G';
break;
case 3:
base = 'T';
break;
default:
throw new IllegalArgumentException("Invalid int: " + x);
}
return (base);
}
/**
* Convert a nucleotide sequence to an integer value
*
* @param seq
* @return
*/
public static int seq2int(String seq) {
int intVal = 0;
int len = seq.length();
for (int i = 0; i < len; i++) {
long currInt = BackgroundModel.base2int(seq.charAt(i));
if (currInt == -1) {
return -1;
}
intVal = intVal << 2;
intVal += currInt;
}
return intVal;
}
/**
*
* @param x
* @return
*/
public static String int2seq(long x, int kmerLen) {
/**
* check that the x is valid for the specified maxKmerLen. Note: 4 << (2 *
* (kmerLen - 1)) = 4^kmerLen
*/
if (x > ((4 << (2 * (kmerLen - 1))) - 1)) {
throw new IllegalArgumentException("Invalid int value, " + x + ", for kmerLen " + kmerLen);
}
StringBuffer seq = new StringBuffer(kmerLen);
for (int i = 0; i < kmerLen; i++) {
int baseVal = (int) (x % 4);
seq.append(BackgroundModel.int2base(baseVal));
x = x >> 2;
}
return seq.reverse().toString();
}
/**
* Assemble a list of distinct pairs of kmers and non-palindrome reverse
* complements.
*
* @param kmerLen
* @return
*/
protected static List<Pair<Integer, Integer>> computeDistinctRevCompPairs(int kmerLen) {
int numKmers = (int) Math.pow(4, kmerLen);
int[] revCompMap = new int[numKmers];
Arrays.fill(revCompMap, -1);
ArrayList<Pair<Integer, Integer>> revCompPairs = new ArrayList<Pair<Integer, Integer>>();
for (int i = 0; i < revCompMap.length; i++) {
if (revCompMap[i] == -1) {
int revComp = BackgroundModel.seq2int(SequenceUtils.reverseComplement(BackgroundModel.int2seq(i, kmerLen)));
revCompMap[i] = revComp;
if (i != revComp) {
revCompMap[revComp] = i;
revCompPairs.add(new Pair<Integer, Integer>(i, revComp));
}
}
}
return revCompPairs;
}
/**
* Return true if the specified kmer only of (zero or more) DNA letters (ACGT)
* @param kmer
* @return
*/
protected static boolean isKmerValid(String kmer) {
Matcher matcher = KMER_PATTERN.matcher(kmer);
return matcher.matches();
}
/**
***********************************************************************
*/
public static void main(String[] args) {
for (int i = 1; i <= 3; i++) {
for (int j = 0; j < Math.pow(4, i); j++) {
String seq = BackgroundModel.int2seq(j, i);
System.out.println(j + "\t" + seq + "\t" + BackgroundModel.seq2int(seq) + " " + isKmerValid(seq));
assert (j == BackgroundModel.seq2int(seq));
}
}
System.out.println(isKmerValid("GN"));
}
}