/* * Genoogle: Similar DNA Sequences Searching Engine and Tools. (http://genoogle.pih.bio.br) * Copyright (C) 2008,2009 Felipe Fernandes Albrecht (felipe.albrecht@gmail.com) * * For further information check the LICENSE file. */ package bio.pih.genoogle.index; import java.util.Arrays; import java.util.List; import bio.pih.genoogle.encoder.DNASequenceEncoder; import bio.pih.genoogle.seq.DNAAlphabet; import com.google.common.collect.Lists; public class LowComplexitySubSequences { private final int bitsByAlphabetSize; private final int subSequenceLength; private final int bitsMask; private final int derivationLimit; private static final int alphabetSize = DNAAlphabet.SINGLETON.getSize(); public LowComplexitySubSequences(int subSequenceLength, int derivationLimit) { this.subSequenceLength = subSequenceLength; this.derivationLimit = derivationLimit; this.bitsByAlphabetSize = DNASequenceEncoder.bitsByAlphabetSize(alphabetSize); this.bitsMask = ((1 << bitsByAlphabetSize) - 1); } public int[] getSubSequences() { double total = 0.0; final int maxSize = (int) Math.pow(alphabetSize, subSequenceLength); for (int i = 0; i < maxSize; i++) { double sequenceStandartDerivation = sequenceStandartDerivation(i); total += sequenceStandartDerivation; } double varianceSumming = 0.0; final double m = total / maxSize; for (int i = 0; i < maxSize; i++) { double sequenceStandartDerivation = sequenceStandartDerivation(i); double d = sequenceStandartDerivation - m; varianceSumming += (d * d); } double variance = varianceSumming / maxSize; double standartDerivation = Math.sqrt(variance); List<Integer> lowComplexitySubSequences = Lists.newArrayList(); double limit = m + (standartDerivation * this.derivationLimit); for (int i = 0; i < maxSize; i++) { double sequenceStandartDerivation = sequenceStandartDerivation(i); if (sequenceStandartDerivation > limit) { lowComplexitySubSequences.add(i); } } int[] lowIndex = new int[lowComplexitySubSequences.size()]; for (int i = 0; i < lowComplexitySubSequences.size(); i++) { lowIndex[i] = lowComplexitySubSequences.get(i).intValue(); } Arrays.sort(lowIndex); return lowIndex; } private double sequenceStandartDerivation(int encoded) { int ac = 0; int cc = 0; int gc = 0; int tc = 0; for (int pos = 0; pos < subSequenceLength; pos++) { int posInInt = subSequenceLength - pos; int shift = posInInt * bitsByAlphabetSize; int value = encoded >> (shift - bitsByAlphabetSize); value &= bitsMask; if (value == 0) { ac++; } else if (value == 1) { cc++; } else if (value == 2) { gc++; } else if (value == 3) { tc++; } } final int total = subSequenceLength; final double m = total / 4.0; double dma = ac - m; double dmc = cc - m; double dmg = gc - m; double dmt = tc - m; double variance = ((dma * dma) + (dmc * dmc) + (dmg * dmg) + (dmt * dmt)) / 4.0; return Math.sqrt(variance); } }