/* * Copyright 2015 MiLaboratory.com * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.milaboratory.core.sequence; import com.fasterxml.jackson.databind.annotation.JsonDeserialize; import com.fasterxml.jackson.databind.annotation.JsonSerialize; import com.milaboratory.primitivio.annotations.Serializable; import com.milaboratory.util.HashFunctions; import gnu.trove.impl.Constants; import gnu.trove.map.TLongObjectMap; import gnu.trove.map.hash.TCharByteHashMap; import gnu.trove.map.hash.TLongObjectHashMap; import java.io.ObjectStreamException; import java.util.Arrays; import java.util.Collections; import java.util.List; /** * Interface for sequence letters alphabet (amino acid, nucleotide, etc.). {@code Alphabet} is responsible for * conversion between char representation of letters (e.g. 'A', 'T', 'G', 'C' in case of * {@link com.milaboratory.core.sequence.NucleotideAlphabet}) and their internal byte representation. * * <p>Alphabet also responsible for storing information about symbols wildcards.</p> * * <p>All alphabets letters grouped in two sets: <b>pure letters</b> and <b>wildcards</b>. Pure letters has codes less * than {@link #basicSize()}, wildcards has codes greater or equals to {@link #basicSize()}.</p> * * <p>Implementation note: all alphabets should be singletons.</p> * * @param <S> corresponding type of sequence * @author Dmitriy Bolotin (bolotin.dmitriy@gmail.com) * @author Stanislav Poslavsky (stvlpos@mail.ru) * @author Mikhail Shugay (mikhail.shugay@gmail.com) * @see com.milaboratory.core.sequence.Sequence * @see com.milaboratory.core.sequence.SequenceBuilder * @see com.milaboratory.core.sequence.NucleotideAlphabet * @see com.milaboratory.core.sequence.NucleotideSequence */ @JsonSerialize(using = Alphabets.Serializer.class) @JsonDeserialize(using = Alphabets.Deserializer.class) @Serializable(by = IO.AlphabetSerializer.class) public abstract class Alphabet<S extends Sequence<S>> implements java.io.Serializable { /* ID */ /** * Alphabet id */ private final byte alphabetId; /** * Alphabet name */ private final String alphabetName; private final int hashCode; /* Content */ /** * Every code below this threshold represents definite letter, codes >= countOfPureLetters represents wildcards */ private final int countOfBasicLetters; /** * Code to char upper case symbol mapping */ private final char[] codeToSymbol; /** * Code to wildcard object mapping */ private final Wildcard[] codeToWildcard; /** * Unmodifiable list of wildcards */ private final List<Wildcard> wildcardsList; /** * Backward mapping for both cases */ private final TCharByteHashMap symbolToCode; /** * Wildcard for any letter (e.g. N for nucleotides, X for amino acids) */ private final Wildcard wildcardForAnyLetter; /** * Mapping between wildcard basicMask representation (bit representation) and wildcard object */ private final TLongObjectMap<Wildcard> basicMaskToWildcard; /** * Singleton empty sequence */ private volatile S empty; ///** // * 0b1111...11 = 2 ^ basicLettersCount - 1 // */ //final long basicLettersMask; Alphabet(String alphabetName, byte alphabetId, int countOfBasicLetters, Wildcard wildcardForAnyLetter, Wildcard... wildcards) { this.alphabetName = alphabetName; this.alphabetId = alphabetId; this.hashCode = HashFunctions.JenkinWang32shift(alphabetId); this.countOfBasicLetters = countOfBasicLetters; this.wildcardForAnyLetter = wildcardForAnyLetter; // Initialization of internal storage int size = wildcards.length; codeToSymbol = new char[size]; // For error checking (see below) Arrays.fill(codeToSymbol, (char) 0xFFFF); codeToWildcard = new Wildcard[size]; // -1 in constructor here is to simplify return of -1 for undefined symbols in symbolToCode symbolToCode = new TCharByteHashMap(Constants.DEFAULT_CAPACITY, Constants.DEFAULT_LOAD_FACTOR, (char) -1, (byte) -1); this.basicMaskToWildcard = new TLongObjectHashMap<>(); // Filling internal maps/arrays for (Wildcard wildcard : wildcards) { if (wildcard.isBasic() && wildcard.getCode() >= countOfBasicLetters) throw new IllegalArgumentException("Definite letter outside countOfPureLetters range."); if (codeToSymbol[wildcard.getCode()] != 0xFFFF) throw new IllegalArgumentException("Duplicate code."); codeToSymbol[wildcard.getCode()] = wildcard.getSymbol(); codeToWildcard[wildcard.getCode()] = wildcard; symbolToCode.put(wildcard.getSymbol(), wildcard.getCode()); symbolToCode.put(Character.toLowerCase(wildcard.getSymbol()), wildcard.getCode()); basicMaskToWildcard.put(wildcard.getBasicMask(), wildcard); } // Error checking for (int i = 0; i < codeToSymbol.length; i++) if (codeToSymbol[i] == 0xFFFF) throw new IllegalArgumentException("Symbol for code " + i + " is not set."); // To be returned by corresponding getter this.wildcardsList = Collections.unmodifiableList(Arrays.asList(codeToWildcard)); //this.basicLettersMask = ~(0xFFFFFFFFFFFFFFFFL << countOfBasicLetters); } /** * Gets number of letters in this alphabet including wildcard letters * * @return number of letters in this alphabet including wildcard letters */ public final int size() { return codeToSymbol.length; } /** * Gets number of letters in this alphabet without wildcard letters * * @return number of letters in this alphabet without wildcard letters */ public final int basicSize() { return countOfBasicLetters; } /** * Returns {@literal true} if this code represents wildcard symbol * * @param code code of letter * @return {@literal true} if this code represents wildcard symbol */ public final boolean isWildcard(byte code) { return code >= basicSize(); } /* Wildcard methods */ /** * Returns wildcard defined by specified code (letter). * * @param code code * @return wildcard defined by specified code (letter) */ public final Wildcard codeToWildcard(byte code) { return codeToWildcard[code]; } /** * Returns a wildcard object for specified letter. * * @param symbol symbol * @return wildcard object for specified letter */ public final Wildcard symbolToWildcard(char symbol) { return codeToWildcard[symbolToCode.get(symbol)]; } /** * Returns a collection of all wildcards defined for this. * * @return a collection of all wildcards defined for this. */ public final List<Wildcard> getAllWildcards() { return wildcardsList; } /** * Returns wildcard for any letter (e.g. N for nucleotides, X for amino acids). * * @return wildcard for any letter (e.g. N for nucleotides, X for amino acids) */ public final Wildcard getWildcardForAnyLetter() { return wildcardForAnyLetter; } /** * Converts wildcard basicMask to Wildcard object. * * @param basicMask bit represenatation of wildcard * @return wildcard object; {@literal null} if there is no such wildcard in the alphabet */ public Wildcard maskToWildcard(long basicMask) { return basicMaskToWildcard.get(basicMask); } /* Conversion */ /** * Gets a char symbol for an alphabet code of the letter * * @param code alphabet code of segment * @return char symbol for an alphabet code of the letter */ public final char codeToSymbol(byte code) { return codeToSymbol[code]; } /** * Gets the binary code representing given symbol (case insensitive) or -1 if there * is no such symbol in this alphabet * * @param symbol symbol to convert * @return binary code of the symbol (case insensitive) or -1 if there is no such symbol in the alphabet */ public byte symbolToCode(char symbol) { return symbolToCode.get(symbol); } /** * Gets the binary code corresponding to given symbol (case insensitive) or throws {@link IllegalArgumentException} * if there is no such symbol in this alphabet * * @param symbol symbol to convert * @return binary code of the symbol (case insensitive) * @throws IllegalArgumentException if there is no such symbol in the alphabet */ public final byte symbolToCodeWithException(char symbol) { byte b = symbolToCode(symbol); if (b == -1) throw new IllegalArgumentException("Unknown letter \'" + symbol + "\'"); return b; } /** * Returns a sequence builder for corresponding sequence type. * * @return sequence builder for corresponding sequence type */ public abstract SequenceBuilder<S> createBuilder(); /** * Returns empty sequence singleton * * @return empty sequence singleton */ public S getEmptySequence() { if (empty == null) synchronized (this) { if (empty == null) empty = createBuilder().createAndDestroy(); } return empty; } /** * Returns the human readable name of this alphabet. * * <p>This name can be then used to obtain the instance of this alphabet using {@link * com.milaboratory.core.sequence.Alphabets#getByName(String)} method if it is registered (see {@link * com.milaboratory.core.sequence.Alphabets#register(Alphabet)}).</p> */ public final String getAlphabetName() { return alphabetName; } /** * Returns byte id of this alphabet * * <p>This name can be then used to obtain the instance of this alphabet using {@link * com.milaboratory.core.sequence.Alphabets#getById(byte)} method if it is registered (see {@link * com.milaboratory.core.sequence.Alphabets#register(Alphabet)}).</p> */ public final byte getId() { return alphabetId; } /** * Parses string representation of sequence. * * @param string string representation of sequence * @return sequence */ public final S parse(String string) { SequenceBuilder<S> builder = createBuilder().ensureCapacity(string.length()); for (int i = 0; i < string.length(); ++i) { byte code = symbolToCode(string.charAt(i)); if (code == -1) throw new IllegalArgumentException("Letter \'" + string.charAt(i) + "\' is not defined in \'" + toString() + "\'."); builder.append(code); } return builder.createAndDestroy(); } /** * Convert alphabet to a readable string. * * @return alphabet as a readable string */ @Override public final String toString() { return "Alphabet{" + alphabetName + '}'; } /** * Returns "address in memory" (hash code as specified by {@link Object#hashCode()}. All Alphabet implementations * must be singletons. */ @Override public final int hashCode() { return hashCode; } /** * Checks that in is the same object (this points to the same address as {@code obj}) * All Alphabet implementations must be singletons. * * @param obj alphabet to check for equality with * @return {@literal true} if alphabets are the same */ @Override public final boolean equals(Object obj) { return obj == this; } /* Internal methods for Java Serialization */ protected Object writeReplace() throws ObjectStreamException { return new AlphabetSerialization(alphabetId); } protected static class AlphabetSerialization implements java.io.Serializable { final byte id; public AlphabetSerialization() { this.id = 0; } public AlphabetSerialization(byte id) { this.id = id; } private Object readResolve() throws ObjectStreamException { return Alphabets.getById(id); } } }