/*
* Copyright 2015 MiLaboratory.com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.milaboratory.core.sequence;
import com.milaboratory.util.Bit2Array;
import com.milaboratory.util.HashFunctions;
import java.util.HashSet;
import java.util.Set;
/**
* Utility methods for sequences.
*
* @author Bolotin Dmitriy (bolotin.dmitriy@gmail.com)
* @author Shugay Mikhail (mikhail.shugay@gmail.com)
*/
public final class SequencesUtils {
/**
 * Tells whether every character of a string is a symbol of the given alphabet, i.e. whether the string can be
 * converted to a sequence of the corresponding type.
 *
 * @param alphabet alphabet to test against
 * @param string   string to check
 * @return {@literal true} if the alphabet recognizes every character (trivially so for an empty string),
 * {@literal false} as soon as one character is not recognized
 */
public static boolean belongsToAlphabet(Alphabet<?> alphabet, String string) {
    final int length = string.length();
    int position = 0;
    while (position < length) {
        // A code of -1 is treated as "symbol is not part of the alphabet".
        final boolean known = alphabet.symbolToCode(string.charAt(position)) != -1;
        if (!known) {
            return false;
        }
        ++position;
    }
    return true;
}
/**
 * Returns the set of alphabets that can represent a given string.
 *
 * <p>Every alphabet returned by {@link com.milaboratory.core.sequence.Alphabets#getAll()} is tested with
 * {@link #belongsToAlphabet(Alphabet, String)}; those that accept the whole string are collected.</p>
 *
 * @param string target string (sequence)
 * @return set of possible alphabets for the given string; empty if no registered alphabet matches
 */
public static Set<Alphabet<?>> possibleAlphabets(String string) {
    Set<Alphabet<?>> alphabets = new HashSet<>();
    // Iterate with a wildcard-parameterized type instead of the raw Alphabet type.
    for (Alphabet<?> alphabet : Alphabets.getAll()) {
        if (belongsToAlphabet(alphabet, string))
            alphabets.add(alphabet);
    }
    return alphabets;
}
/**
 * Calculates the number of mismatches (comparing position by position) between two regions of one or two
 * different sequences.
 *
 * @param seq0       first sequence
 * @param seq0Offset position of the first letter of the region in the first sequence
 * @param seq1       second sequence (may be the same as {@code seq0})
 * @param seq1Offset position of the first letter of the region in the second sequence
 * @param length     length of both regions
 * @param <S>        type of sequence
 * @return number of positions at which the two regions hold different codes
 * @throws java.lang.IllegalArgumentException if an offset or the length is negative, or if one of the regions
 *                                            is outside of its target sequence
 */
public static <S extends Sequence<S>> int mismatchCount(S seq0, int seq0Offset, S seq1, int seq1Offset, int length) {
    // Reject negative values up front, so that the subtractions below cannot overflow and a bad offset is
    // reported as IllegalArgumentException rather than surfacing from codeAt(...).
    if (seq0Offset < 0 || seq1Offset < 0 || length < 0)
        throw new IllegalArgumentException("Negative offset or length: seq0Offset=" + seq0Offset
                + ", seq1Offset=" + seq1Offset + ", length=" + length);

    // "size - offset < length" is the overflow-safe form of "size < offset + length".
    if (seq0.size() - seq0Offset < length || seq1.size() - seq1Offset < length)
        throw new IllegalArgumentException("Region is outside of target sequence: seq0.size()=" + seq0.size()
                + ", seq0Offset=" + seq0Offset + ", seq1.size()=" + seq1.size()
                + ", seq1Offset=" + seq1Offset + ", length=" + length);

    int mm = 0;
    for (int i = 0; i < length; ++i)
        if (seq0.codeAt(i + seq0Offset) != seq1.codeAt(i + seq1Offset))
            ++mm;
    return mm;
}
/**
 * Returns a concatenation of several sequences.
 *
 * <p>If exactly one sequence is passed, that very instance is returned without copying.</p>
 *
 * @param sequences array of sequences, concatenated in the given order
 * @param <S>       type of sequences
 * @return concatenation of several sequences
 * @throws java.lang.IllegalArgumentException if no sequences are passed
 */
@SafeVarargs // the varargs array is only read, never written to or exposed
public static <S extends Seq<S>> S concatenate(S... sequences) {
    if (sequences.length == 0)
        throw new IllegalArgumentException("Zero arguments");

    if (sequences.length == 1)
        return sequences[0];

    // Total size is computed first, so the builder can be sized once.
    int size = 0;
    for (S s : sequences)
        size += s.size();

    SeqBuilder<S> builder = sequences[0].getBuilder().ensureCapacity(size);

    for (S s : sequences)
        builder.append(s);

    return builder.createAndDestroy();
}
/**
 * Converts a sequence with wildcards to a sequence without wildcards by replacing each wildcard letter with a
 * uniformly distributed letter from the set of letters allowed by that wildcard (see {@link
 * Wildcard#getUniformlyDistributedBasicCode(long)}).
 *
 * <p>Returns the same result for the same combination of sequence and seed.</p>
 *
 * @param sequence sequence to convert
 * @param seed     seed for random generator
 * @param <S>      type of sequence
 * @return sequence with wildcards replaced by uniformly distributed random basic letters
 */
public static <S extends Sequence<S>> S wildcardsToRandomBasic(S sequence, long seed) {
    final Alphabet<S> alphabet = sequence.getAlphabet();
    final int length = sequence.size();
    final SequenceBuilder<S> result = alphabet.createBuilder().ensureCapacity(length);

    // Running seed: re-hashed at every wildcard position from its previous value plus that position.
    long state = seed;
    for (int position = 0; position < length; ++position) {
        final byte code = sequence.codeAt(position);

        // Non-wildcard letters are copied through unchanged.
        if (!alphabet.isWildcard(code)) {
            result.append(code);
            continue;
        }

        state = HashFunctions.JenkinWang64shift(state + position);
        result.append(alphabet.codeToWildcard(code).getUniformlyDistributedBasicCode(state));
    }
    return result.createAndDestroy();
}
/**
 * Copies the codes of a nucleotide sequence, position by position, into a new {@link Bit2Array} of the same
 * size. Used to write legacy file formats.
 *
 * @param seq nucleotide sequence to convert; must not contain wildcards
 * @return Bit2Array representation of nucleotide sequence
 * @throws java.lang.IllegalArgumentException if the sequence contains wildcards
 */
public static Bit2Array convertNSequenceToBit2Array(NucleotideSequence seq) {
    if (seq.containWildcards()) {
        throw new IllegalArgumentException("Sequences with wildcards are not supported.");
    }

    final int length = seq.size();
    final Bit2Array packed = new Bit2Array(length);
    int position = 0;
    while (position < length) {
        packed.set(position, seq.codeAt(position));
        ++position;
    }
    return packed;
}
/**
 * Builds a nucleotide sequence by appending each element of a {@link Bit2Array}, in order, as a letter code.
 * Used to read legacy file formats.
 *
 * @param bar array of codes to convert
 * @return NucleotideSequence constructed from Bit2Array
 */
public static NucleotideSequence convertBit2ArrayToNSequence(Bit2Array bar) {
    final int length = bar.size();
    final SequenceBuilder<NucleotideSequence> builder =
            NucleotideSequence.ALPHABET.createBuilder().ensureCapacity(length);

    int position = 0;
    while (position < length) {
        builder.append((byte) bar.get(position));
        ++position;
    }
    return builder.createAndDestroy();
}
}