/* * Genoogle: Similar DNA Sequences Searching Engine and Tools. (http://genoogle.pih.bio.br) * Copyright (C) 2008,2009 Felipe Fernandes Albrecht (felipe.albrecht@gmail.com) * * For further information check the LICENSE file. */ package bio.pih.genoogle.encoder; import bio.pih.genoogle.index.ValueOutOfBoundsException; import bio.pih.genoogle.seq.Alphabet; import bio.pih.genoogle.seq.IllegalSymbolException; import bio.pih.genoogle.seq.LightweightSymbolList; import bio.pih.genoogle.seq.SymbolList; import bio.pih.genoogle.util.SymbolListWindowIterator; import bio.pih.genoogle.util.SymbolListWindowIteratorFactory; /** * Class with the main informations of the encoding sequences. * * @author albrecht */ public abstract class SequenceEncoder { private static int POSITION_LENGTH = 0; private static int POSITION_BEGIN_BITS_VECTOR = 1; private static int maximumAlphabetBitsSize = 8; protected final Alphabet alphabet; protected final int subSequenceLength; protected final int bitsByAlphabetSize; protected final int bitsMask; /** * @param alphabet * @param subSequenceLength */ public SequenceEncoder(Alphabet alphabet, int subSequenceLength) throws ValueOutOfBoundsException { this.alphabet = alphabet; this.subSequenceLength = subSequenceLength; this.bitsByAlphabetSize = bitsByAlphabetSize(alphabet.getSize()); this.bitsMask = ((1 << bitsByAlphabetSize) - 1); // Check if the sub sequence length and alphabet can be stored inside a 32 bits integer. if (this.bitsByAlphabetSize * this.subSequenceLength > 32) { throw new RuntimeException("The subsequence length ("+this.subSequenceLength+") is to long for this alphabet. Use a smaller subsequence length."); } } /** * @param alphabetSize * must be equal or higher than 1 and equals or lower than 256 * @return how many bits is necessary to store each character of the given alphabet size. */ public static int bitsByAlphabetSize(int alphabetSize) throws ValueOutOfBoundsException { if (alphabetSize <= 0) { throw new ValueOutOfBoundsException("alphabetSize lower than zero."); } if (alphabetSize > (1 << maximumAlphabetBitsSize)) { throw new ValueOutOfBoundsException("alphabetSize higher than " + (1 << maximumAlphabetBitsSize)); } int maxValue = alphabetSize - 1; if (maxValue == 0) { return 1; } int bits = maximumAlphabetBitsSize; while ((maxValue & (1 << bits)) == 0) { bits--; } // the "one" that was used above should be added. return bits + 1; } /** * @return Position in integer vector that is the information of the {@link SymbolList} length */ public final static int getPositionLength() { return POSITION_LENGTH; } /** * @return Position that the bit vector itself begin */ public final static int getPositionBeginBitsVector() { return POSITION_BEGIN_BITS_VECTOR; } /** * @param encodedSequence * @return length in bases of the encoded sequence. */ public final static int getSequenceLength(int[] encodedSequence) { return encodedSequence[POSITION_LENGTH]; } /** * @return the length of the subsequences. */ public int getSubSequenceLength() { return subSequenceLength; } public Alphabet getAlphabet() { return alphabet; } public abstract int getBitsFromChar(char symbol); public abstract char getSymbolFromBits(int bits); /** * Encode a subsequence of the encoder length to its int representation * * @param subSymbolList * @return an int containing the representation of the subsequence */ public int encodeSubSequenceToInteger(SymbolList subSymbolList) { if (subSymbolList.getLength() > subSequenceLength) { throw new ValueOutOfBoundsException(subSymbolList + " is bigger than subSequenceLength(" + subSequenceLength + ")"); } int encoded = 0; for (int i = 1; i <= subSymbolList.getLength(); i++) { encoded |= (getBitsFromChar(subSymbolList.symbolAt(i)) << ((subSequenceLength - i) * bitsByAlphabetSize)); } return encoded; } public int encodeSubSequenceToInteger(String subSequence) { if (subSequence.length() > subSequenceLength) { throw new ValueOutOfBoundsException(subSequence + " is bigger than subSequenceLength(" + subSequenceLength + ")"); } int encoded = 0; for (int i = 0; i < subSequence.length(); i++) { encoded |= (getBitsFromChar(subSequence.charAt(i)) << ((subSequenceLength - (i + 1)) * bitsByAlphabetSize)); } return encoded; } /** * Decode an int vector to its sequence string representation * * @param encoded * @return the sequence string */ public String decodeIntegerToString(int encoded) { return decodeIntegerToString(encoded, subSequenceLength); } /** * @param encoded * @return {@link LightweightSymbolList} of the given encoded sub-sequence. */ public SymbolList decodeIntegerToSymbolList(int encoded) throws IllegalSymbolException { String sequenceString = decodeIntegerToString(encoded, subSequenceLength); return new LightweightSymbolList(alphabet, sequenceString); } private String decodeIntegerToString(int encoded, int length) { return decodeIntegerToString(encoded, 0, length - 1); } /** * TODO: Optimize this function using a constant masks table. */ private String decodeIntegerToString(int encoded, int begin, int end) { StringBuilder sb = new StringBuilder((end - begin) + 1); for (int pos = begin; pos <= end; pos++) { int posInInt = subSequenceLength - pos; int shift = posInInt * bitsByAlphabetSize; int value = encoded >> (shift - bitsByAlphabetSize); sb.append(getSymbolFromBits(value & bitsMask)); } return sb.toString(); } /** * Encode a {@link SymbolList} of length 1 to (2^16)-1 to an array of int. * * @param sequence * @return an array of int as bit vector */ public int[] encodeSymbolListToIntegerArray(SymbolList sequence) { if (!sequence.getAlphabet().equals(alphabet)) { throw new RuntimeException("SymbolList alphabet ("+sequence.toString()+") is not the same from the encoder ("+alphabet+")"); } int size = sequence.getLength() / subSequenceLength; int extra = sequence.getLength() % subSequenceLength; if (extra != 0) { // extra space for incomplete sub-sequence size++; } size++; // extra space for information on the length. int sequenceEncoded[] = new int[size]; sequenceEncoded[getPositionLength()] = sequence.getLength(); if (sequence.getLength() < subSequenceLength) { sequenceEncoded[getPositionBeginBitsVector()] = encodeSubSequenceToInteger(sequence); } else { int pos = getPositionBeginBitsVector(); SymbolListWindowIterator symbolListWindowIterator = SymbolListWindowIteratorFactory.getNotOverlappedFactory().newSymbolListWindowIterator( sequence, this.subSequenceLength); while (symbolListWindowIterator.hasNext()) { SymbolList next = symbolListWindowIterator.next(); sequenceEncoded[pos] = encodeSubSequenceToInteger(next); pos++; } if (pos < size) { int from = sequence.getLength() - extra + 1; sequenceEncoded[pos] = encodeSubSequenceToInteger(sequence.subSymbolList(from, sequence.getLength())); } } return sequenceEncoded; } /** * @param encodedSequence * @return the {@link SymbolList} that is stored in encodedSequence */ public SymbolList decodeIntegerArrayToSymbolList(int[] encodedSequence) throws IllegalSymbolException { String sequenceString = decodeIntegerArrayToString(encodedSequence); return new LightweightSymbolList(alphabet, sequenceString); } /** * @param encodedSequence * @param begin * @param end * @return the sequence in {@link String} form that is stored in encodedSequence */ public String decodeIntegerArrayToString(int[] encodedSequence, int begin, int end) { if ((end - begin) + 1 < subSequenceLength) { return decoteIntegerArrayToStringShortenOneSubSequence(encodedSequence, begin, end); } StringBuilder sequence = new StringBuilder(); int arrayPos = (begin / subSequenceLength) + 1; int posInInt = begin % subSequenceLength; if (posInInt != 0) { sequence.append(decodeIntegerToString(encodedSequence[arrayPos], posInInt, subSequenceLength - 1)); arrayPos++; } int arrayPosLast = end / subSequenceLength; for (; arrayPos <= arrayPosLast; arrayPos++) { sequence.append(decodeIntegerToString(encodedSequence[arrayPos], subSequenceLength)); } int posInIntLast = end % subSequenceLength; if (posInIntLast > 0) { sequence.append(decodeIntegerToString(encodedSequence[arrayPos], 0, posInIntLast)); } return sequence.toString(); } private String decoteIntegerArrayToStringShortenOneSubSequence(int[] encodedSequence, int begin, int end) { int arrayPosBegin = (begin / subSequenceLength) + 1; int arrayPosEnd = (end / subSequenceLength) + 1; int firstInt = encodedSequence[arrayPosBegin]; if (arrayPosBegin == arrayPosEnd) { return decodeIntegerToString(firstInt, begin, end); } StringBuilder sequence = new StringBuilder(); int beginPos = begin % subSequenceLength; sequence.append(decodeIntegerToString(firstInt, beginPos, subSequenceLength - 1)); int endPos = end % subSequenceLength; sequence.append(decodeIntegerToString(encodedSequence[arrayPosEnd], 0, endPos)); return sequence.toString(); } /** * @param encodedSequence * @return the Sequence in String form encoded in encodedSequence. */ public String decodeIntegerArrayToString(int[] encodedSequence) { StringBuilder sequence = new StringBuilder(encodedSequence[getPositionLength()]); int extra = encodedSequence[getPositionLength()] % subSequenceLength; if (extra == 0) { for (int i = getPositionBeginBitsVector(); i < encodedSequence.length; i++) { sequence.append(decodeIntegerToString(encodedSequence[i], subSequenceLength)); } return sequence.toString(); } int i; for (i = getPositionBeginBitsVector(); i < encodedSequence.length - 1; i++) { sequence.append(decodeIntegerToString(encodedSequence[i], subSequenceLength)); } sequence.append(decodeIntegerToString(encodedSequence[i], extra)); return sequence.toString(); } // TODO: 1o. aplico a mask e depois faco o shift right, nao seria melhor fazer inverso? public int getValueAtPos(int[] encodedSequence, int pos, int subSequenceLength) { int posInArray = (pos / subSequenceLength) + 1; int posInInt = (subSequenceLength) - (pos % subSequenceLength); int vectorValue = encodedSequence[posInArray]; int shift = posInInt * this.bitsByAlphabetSize; int value = vectorValue >> (shift - this.bitsByAlphabetSize); return value & this.bitsMask; } }