/* * Copyright (c) 2012 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package htsjdk.variant.variantcontext; import htsjdk.samtools.util.StringUtil; import java.util.Arrays; import java.util.Collection; /** * Immutable representation of an allele * * Types of alleles: * * Ref: a t C g a // C is the reference base * * : a t G g a // C base is a G in some individuals * * : a t - g a // C base is deleted w.r.t. the reference * * : a t CAg a // A base is inserted w.r.t. the reference sequence * * In these cases, where are the alleles? * * SNP polymorphism of C/G -> { C , G } -> C is the reference allele * 1 base deletion of C -> { tC , t } -> C is the reference allele and we include the preceding reference base (null alleles are not allowed) * 1 base insertion of A -> { C ; CA } -> C is the reference allele (because null alleles are not allowed) * * Suppose I see a the following in the population: * * Ref: a t C g a // C is the reference base * : a t G g a // C base is a G in some individuals * : a t - g a // C base is deleted w.r.t. the reference * * How do I represent this? There are three segregating alleles: * * { C , G , - } * * and these are represented as: * * { tC, tG, t } * * Now suppose I have this more complex example: * * Ref: a t C g a // C is the reference base * : a t - g a * : a t - - a * : a t CAg a * * There are actually four segregating alleles: * * { Cg , -g, --, and CAg } over bases 2-4 * * represented as: * * { tCg, tg, t, tCAg } * * Critically, it should be possible to apply an allele to a reference sequence to create the * correct haplotype sequence: * * Allele + reference => haplotype * * For convenience, we are going to create Alleles where the GenomeLoc of the allele is stored outside of the * Allele object itself. So there's an idea of an A/C polymorphism independent of it's surrounding context. * * Given list of alleles it's possible to determine the "type" of the variation * * A / C @ loc => SNP * - / A => INDEL * * If you know where allele is the reference, you can determine whether the variant is an insertion or deletion. * * Alelle also supports is concept of a NO_CALL allele. This Allele represents a haplotype that couldn't be * determined. This is usually represented by a '.' allele. * * Note that Alleles store all bases as bytes, in **UPPER CASE**. So 'atc' == 'ATC' from the perspective of an * Allele. * @author ebanks, depristo */ public class Allele implements Comparable<Allele> { private static final byte[] EMPTY_ALLELE_BASES = new byte[0]; private boolean isRef = false; private boolean isNoCall = false; private boolean isSymbolic = false; private byte[] bases = null; public final static String NO_CALL_STRING = "."; /** A generic static NO_CALL allele for use */ // no public way to create an allele protected Allele(byte[] bases, boolean isRef) { // null alleles are no longer allowed if ( wouldBeNullAllele(bases) ) { throw new IllegalArgumentException("Null alleles are not supported"); } // no-calls are represented as no bases if ( wouldBeNoCallAllele(bases) ) { this.bases = EMPTY_ALLELE_BASES; isNoCall = true; if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele"); return; } if ( wouldBeSymbolicAllele(bases) ) { isSymbolic = true; if ( isRef ) throw new IllegalArgumentException("Cannot tag a symbolic allele as the reference allele"); } else { StringUtil.toUpperCase(bases); } this.isRef = isRef; this.bases = bases; if ( ! acceptableAlleleBases(bases) ) throw new IllegalArgumentException("Unexpected base in allele bases \'" + new String(bases)+"\'"); } protected Allele(String bases, boolean isRef) { this(bases.getBytes(), isRef); } /** * Creates a new allele based on the provided one. Ref state will be copied unless ignoreRefState is true * (in which case the returned allele will be non-Ref). * * This method is efficient because it can skip the validation of the bases (since the original allele was already validated) * * @param allele the allele from which to copy the bases * @param ignoreRefState should we ignore the reference state of the input allele and use the default ref state? */ protected Allele(final Allele allele, final boolean ignoreRefState) { this.bases = allele.bases; this.isRef = ignoreRefState ? false : allele.isRef; this.isNoCall = allele.isNoCall; this.isSymbolic = allele.isSymbolic; } private final static Allele REF_A = new Allele("A", true); private final static Allele ALT_A = new Allele("A", false); private final static Allele REF_C = new Allele("C", true); private final static Allele ALT_C = new Allele("C", false); private final static Allele REF_G = new Allele("G", true); private final static Allele ALT_G = new Allele("G", false); private final static Allele REF_T = new Allele("T", true); private final static Allele ALT_T = new Allele("T", false); private final static Allele REF_N = new Allele("N", true); private final static Allele ALT_N = new Allele("N", false); public final static Allele NO_CALL = new Allele(NO_CALL_STRING, false); // --------------------------------------------------------------------------------------------------------- // // creation routines // // --------------------------------------------------------------------------------------------------------- /** * Create a new Allele that includes bases and if tagged as the reference allele if isRef == true. If bases * == '-', a Null allele is created. If bases == '.', a no call Allele is created. * * @param bases the DNA sequence of this variation, '-', of '.' * @param isRef should we make this a reference allele? * @throws IllegalArgumentException if bases contains illegal characters or is otherwise malformated */ public static Allele create(byte[] bases, boolean isRef) { if ( bases == null ) throw new IllegalArgumentException("create: the Allele base string cannot be null; use new Allele() or new Allele(\"\") to create a Null allele"); if ( bases.length == 1 ) { // optimization to return a static constant Allele for each single base object switch (bases[0]) { case '.': if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele"); return NO_CALL; case 'A': case 'a' : return isRef ? REF_A : ALT_A; case 'C': case 'c' : return isRef ? REF_C : ALT_C; case 'G': case 'g' : return isRef ? REF_G : ALT_G; case 'T': case 't' : return isRef ? REF_T : ALT_T; case 'N': case 'n' : return isRef ? REF_N : ALT_N; default: throw new IllegalArgumentException("Illegal base [" + (char)bases[0] + "] seen in the allele"); } } else { return new Allele(bases, isRef); } } public static Allele create(byte base, boolean isRef) { return create( new byte[]{ base }, isRef); } public static Allele create(byte base) { return create( base, false ); } public static Allele extend(Allele left, byte[] right) { if (left.isSymbolic()) throw new IllegalArgumentException("Cannot extend a symbolic allele"); byte[] bases = new byte[left.length() + right.length]; System.arraycopy(left.getBases(), 0, bases, 0, left.length()); System.arraycopy(right, 0, bases, left.length(), right.length); return create(bases, left.isReference()); } /** * @param bases bases representing an allele * @return true if the bases represent the null allele */ public static boolean wouldBeNullAllele(byte[] bases) { return (bases.length == 1 && bases[0] == '-') || bases.length == 0; } /** * @param bases bases representing an allele * @return true if the bases represent the NO_CALL allele */ public static boolean wouldBeNoCallAllele(byte[] bases) { return bases.length == 1 && bases[0] == '.'; } /** * @param bases bases representing an allele * @return true if the bases represent a symbolic allele */ public static boolean wouldBeSymbolicAllele(byte[] bases) { if ( bases.length <= 1 ) return false; else { final String strBases = new String(bases); return (bases[0] == '<' || bases[bases.length-1] == '>') || // symbolic or large insertion (bases[0] == '.' || bases[bases.length-1] == '.') || // single breakend (strBases.contains("[") || strBases.contains("]")); // mated breakend } } /** * @param bases bases representing an allele * @return true if the bases represent the well formatted allele */ public static boolean acceptableAlleleBases(String bases) { return acceptableAlleleBases(bases.getBytes(), true); } public static boolean acceptableAlleleBases(String bases, boolean allowNsAsAcceptable) { return acceptableAlleleBases(bases.getBytes(), allowNsAsAcceptable); } /** * @param bases bases representing an allele * @return true if the bases represent the well formatted allele */ public static boolean acceptableAlleleBases(byte[] bases) { return acceptableAlleleBases(bases, true); // default: N bases are acceptable } public static boolean acceptableAlleleBases(byte[] bases, boolean allowNsAsAcceptable) { if ( wouldBeNullAllele(bases) ) return false; if ( wouldBeNoCallAllele(bases) || wouldBeSymbolicAllele(bases) ) return true; for (byte base : bases ) { switch (base) { case 'A': case 'C': case 'G': case 'T': case 'a': case 'c': case 'g': case 't': break; case 'N' : case 'n' : if (allowNsAsAcceptable) break; else return false; default: return false; } } return true; } /** * @see Allele(byte[], boolean) * * @param bases bases representing an allele * @param isRef is this the reference allele? */ public static Allele create(String bases, boolean isRef) { return create(bases.getBytes(), isRef); } /** * Creates a non-Ref allele. @see Allele(byte[], boolean) for full information * * @param bases bases representing an allele */ public static Allele create(String bases) { return create(bases, false); } /** * Creates a non-Ref allele. @see Allele(byte[], boolean) for full information * * @param bases bases representing an allele */ public static Allele create(byte[] bases) { return create(bases, false); } /** * Creates a new allele based on the provided one. Ref state will be copied unless ignoreRefState is true * (in which case the returned allele will be non-Ref). * * This method is efficient because it can skip the validation of the bases (since the original allele was already validated) * * @param allele the allele from which to copy the bases * @param ignoreRefState should we ignore the reference state of the input allele and use the default ref state? */ public static Allele create(final Allele allele, final boolean ignoreRefState) { return new Allele(allele, ignoreRefState); } // --------------------------------------------------------------------------------------------------------- // // accessor routines // // --------------------------------------------------------------------------------------------------------- // Returns true if this is the NO_CALL allele public boolean isNoCall() { return isNoCall; } // Returns true if this is not the NO_CALL allele public boolean isCalled() { return ! isNoCall(); } // Returns true if this Allele is the reference allele public boolean isReference() { return isRef; } // Returns true if this Allele is not the reference allele public boolean isNonReference() { return ! isReference(); } // Returns true if this Allele is symbolic (i.e. no well-defined base sequence) public boolean isSymbolic() { return isSymbolic; } // Returns a nice string representation of this object public String toString() { return ( isNoCall() ? NO_CALL_STRING : getDisplayString() ) + (isReference() ? "*" : ""); } /** * Return the DNA bases segregating in this allele. Note this isn't reference polarized, * so the Null allele is represented by a vector of length 0 * * @return the segregating bases */ public byte[] getBases() { return isSymbolic ? EMPTY_ALLELE_BASES : bases; } /** * Return the DNA bases segregating in this allele in String format. * This is useful, because toString() adds a '*' to reference alleles and getBases() returns garbage when you call toString() on it. * * @return the segregating bases */ public String getBaseString() { return isNoCall() ? NO_CALL_STRING : new String(getBases()); } /** * Return the printed representation of this allele. * Same as getBaseString(), except for symbolic alleles. * For symbolic alleles, the base string is empty while the display string contains <TAG>. * * @return the allele string representation */ public String getDisplayString() { return new String(bases); } /** * Same as #getDisplayString() but returns the result as byte[]. * * Slightly faster then getDisplayString() * * @return the allele string representation */ public byte[] getDisplayBases() { return bases; } /** * @param other the other allele * * @return true if these alleles are equal */ public boolean equals(Object other) { return ( ! (other instanceof Allele) ? false : equals((Allele)other, false) ); } /** * @return hash code */ public int hashCode() { int hash = 1; for (int i = 0; i < bases.length; i++) hash += (i+1) * bases[i]; return hash; } /** * Returns true if this and other are equal. If ignoreRefState is true, then doesn't require both alleles has the * same ref tag * * @param other allele to compare to * @param ignoreRefState if true, ignore ref state in comparison * @return true if this and other are equal */ public boolean equals(Allele other, boolean ignoreRefState) { return this == other || (isRef == other.isRef || ignoreRefState) && isNoCall == other.isNoCall && (bases == other.bases || Arrays.equals(bases, other.bases)); } /** * @param test bases to test against * * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles */ public boolean basesMatch(byte[] test) { return !isSymbolic && (bases == test || Arrays.equals(bases, test)); } /** * @param test bases to test against * * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles */ public boolean basesMatch(String test) { return basesMatch(test.toUpperCase().getBytes()); } /** * @param test allele to test against * * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles */ public boolean basesMatch(Allele test) { return basesMatch(test.getBases()); } /** * @return the length of this allele. Null and NO_CALL alleles have 0 length. */ public int length() { return isSymbolic ? 0 : bases.length; } // --------------------------------------------------------------------------------------------------------- // // useful static functions // // --------------------------------------------------------------------------------------------------------- public static Allele getMatchingAllele(Collection<Allele> allAlleles, byte[] alleleBases) { for ( Allele a : allAlleles ) { if ( a.basesMatch(alleleBases) ) { return a; } } if ( wouldBeNoCallAllele(alleleBases) ) return NO_CALL; else return null; // couldn't find anything } public int compareTo(Allele other) { if ( isReference() && other.isNonReference() ) return -1; else if ( isNonReference() && other.isReference() ) return 1; else return getBaseString().compareTo(other.getBaseString()); // todo -- potential performance issue } public static boolean oneIsPrefixOfOther(Allele a1, Allele a2) { if ( a2.length() >= a1.length() ) return firstIsPrefixOfSecond(a1, a2); else return firstIsPrefixOfSecond(a2, a1); } private static boolean firstIsPrefixOfSecond(Allele a1, Allele a2) { String a1String = a1.getBaseString(); return a2.getBaseString().substring(0, a1String.length()).equals(a1String); } }