/* * Copyright (c) 2012 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package htsjdk.variant.bcf2; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; /** * An efficient scheme for building and obtaining specialized * genotype field decoders. Used by the BCFCodec to parse * with little overhead the fields from BCF2 encoded genotype * records * * @author Mark DePristo * @since 6/12 */ public class BCF2GenotypeFieldDecoders { private final static boolean ENABLE_FASTPATH_GT = true; private final static int MIN_SAMPLES_FOR_FASTPATH_GENOTYPES = 0; // TODO -- update to reasonable number // initialized once per writer to allow parallel writers to work private final HashMap<String, Decoder> genotypeFieldDecoder = new HashMap<String, Decoder>(); private final Decoder defaultDecoder = new GenericDecoder(); public BCF2GenotypeFieldDecoders(final VCFHeader header) { // TODO -- fill in appropriate decoders for each FORMAT field in the header genotypeFieldDecoder.put(VCFConstants.GENOTYPE_KEY, new GTDecoder()); // currently the generic decoder handles FILTER values properly, in so far as we don't tolerate multiple filter field values per genotype genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new FTDecoder()); genotypeFieldDecoder.put(VCFConstants.DEPTH_KEY, new DPDecoder()); genotypeFieldDecoder.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new ADDecoder()); genotypeFieldDecoder.put(VCFConstants.GENOTYPE_PL_KEY, new PLDecoder()); genotypeFieldDecoder.put(VCFConstants.GENOTYPE_QUALITY_KEY, new GQDecoder()); } // ----------------------------------------------------------------- // // Genotype field decoder // // ----------------------------------------------------------------- /** * Return decoder appropriate for field, or the generic decoder if no * specialized one is bound * @param field the GT field to decode * @return a non-null decoder */ public Decoder getDecoder(final String field) { final Decoder d = genotypeFieldDecoder.get(field); return d == null ? defaultDecoder : d; } /** * Decoder a field (implicit from creation) encoded as * typeDescriptor in the decoder object in the GenotypeBuilders * one for each sample in order. * * The way this works is that this decode method * iterates over the builders, decoding a genotype field * in BCF2 for each sample from decoder. * * This system allows us to easily use specialized * decoders for specific genotype field values. For example, * we use a special decoder to directly read the BCF2 data for * the PL field into a int[] rather than the generic List of Integer */ public interface Decoder { public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException; } private class GTDecoder implements Decoder { @Override public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && numElements == 2 && gbs.length >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES ) fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs); else { generalDecode(siteAlleles, numElements, decoder, typeDescriptor, gbs); } } /** * fast path for many samples with diploid genotypes * * The way this would work is simple. Create a List<Allele> diploidGenotypes[] object * After decoding the offset, if that sample is diploid compute the * offset into the alleles vector which is simply offset = allele0 * nAlleles + allele1 * if there's a value at diploidGenotypes[offset], use it, otherwise create the genotype * cache it and use that * * Some notes. If there are nAlleles at the site, there are implicitly actually * n + 1 options including */ @SuppressWarnings({"unchecked"}) private final void fastBiallelicDiploidDecode(final List<Allele> siteAlleles, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) throws IOException { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); final int nPossibleGenotypes = 3 * 3; final Object allGenotypes[] = new Object[nPossibleGenotypes]; for ( final GenotypeBuilder gb : gbs ) { final int a1 = decoder.decodeInt(type); final int a2 = decoder.decodeInt(type); if ( a1 == type.getMissingBytes() ) { assert a2 == type.getMissingBytes(); // no called sample GT = . gb.alleles(null); } else if ( a2 == type.getMissingBytes() ) { gb.alleles(Arrays.asList(getAlleleFromEncoded(siteAlleles, a1))); } else { // downshift to remove phase final int offset = (a1 >> 1) * 3 + (a2 >> 1); assert offset < allGenotypes.length; // TODO -- how can I get rid of this cast? List<Allele> gt = (List<Allele>)allGenotypes[offset]; if ( gt == null ) { final Allele allele1 = getAlleleFromEncoded(siteAlleles, a1); final Allele allele2 = getAlleleFromEncoded(siteAlleles, a2); gt = Arrays.asList(allele1, allele2); allGenotypes[offset] = gt; } gb.alleles(gt); } final boolean phased = (a1 & 0x01) == 1; gb.phased(phased); } } private final void generalDecode(final List<Allele> siteAlleles, final int ploidy, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) throws IOException { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); // a single cache for the encoded genotypes, since we don't actually need this vector final int[] tmp = new int[ploidy]; for ( final GenotypeBuilder gb : gbs ) { final int[] encoded = decoder.decodeIntArray(ploidy, type, tmp); if ( encoded == null ) // no called sample GT = . gb.alleles(null); else { assert encoded.length > 0; // we have at least some alleles to decode final List<Allele> gt = new ArrayList<Allele>(encoded.length); // note that the auto-pruning of fields magically handles different // ploidy per sample at a site for ( final int encode : encoded ) gt.add(getAlleleFromEncoded(siteAlleles, encode)); gb.alleles(gt); final boolean phased = (encoded[0] & 0x01) == 1; gb.phased(phased); } } } private final Allele getAlleleFromEncoded(final List<Allele> siteAlleles, final int encode) { final int offset = encode >> 1; return offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1); } } private class DPDecoder implements Decoder { @Override public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { for ( final GenotypeBuilder gb : gbs ) { // the -1 is for missing gb.DP(decoder.decodeInt(typeDescriptor, -1)); } } } private class GQDecoder implements Decoder { @Override public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { for ( final GenotypeBuilder gb : gbs ) { // the -1 is for missing gb.GQ(decoder.decodeInt(typeDescriptor, -1)); } } } private class ADDecoder implements Decoder { @Override public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { for ( final GenotypeBuilder gb : gbs ) { gb.AD(decoder.decodeIntArray(typeDescriptor, numElements)); } } } private class PLDecoder implements Decoder { @Override public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { for ( final GenotypeBuilder gb : gbs ) { gb.PL(decoder.decodeIntArray(typeDescriptor, numElements)); } } } private class GenericDecoder implements Decoder { @Override public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { for ( final GenotypeBuilder gb : gbs ) { Object value = decoder.decodeTypedValue(typeDescriptor, numElements); if ( value != null ) { // don't add missing values if ( value instanceof List && ((List)value).size() == 1) { // todo -- I really hate this, and it suggests that the code isn't completely right // the reason it's here is that it's possible to prune down a vector to a singleton // value and there we have the contract that the value comes back as an atomic value // not a vector of size 1 value = ((List)value).get(0); } gb.attribute(field, value); } } } } private class FTDecoder implements Decoder { @Override public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { for ( final GenotypeBuilder gb : gbs ) { Object value = decoder.decodeTypedValue(typeDescriptor, numElements); assert value == null || value instanceof String; gb.filter((String)value); } } } }