/* * Copyright (c) 2012 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package htsjdk.variant.variantcontext; import htsjdk.tribble.FeatureCodec; import htsjdk.tribble.FeatureCodecHeader; import htsjdk.tribble.Tribble; import htsjdk.tribble.readers.LineIterator; import htsjdk.tribble.readers.LineIteratorImpl; import htsjdk.tribble.readers.LineReaderUtil; import htsjdk.tribble.readers.PositionalBufferedStream; import htsjdk.variant.VariantBaseTest; import htsjdk.variant.bcf2.BCF2Codec; import htsjdk.variant.utils.GeneralUtils; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFCodec; import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFContigHeaderLine; import htsjdk.variant.vcf.VCFFilterHeaderLine; import htsjdk.variant.vcf.VCFFormatHeaderLine; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.testng.Assert; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.EnumSet; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; /** * Routines for generating all sorts of VCs for testing * * @author Your Name * @since Date created */ public class VariantContextTestProvider { final private static boolean ENABLE_GENOTYPE_TESTS = true; final private static boolean ENABLE_A_AND_G_TESTS = true; final private static boolean ENABLE_VARARRAY_TESTS = true; final private static boolean ENABLE_PLOIDY_TESTS = true; final private static boolean ENABLE_PL_TESTS = true; final private static boolean ENABLE_SYMBOLIC_ALLELE_TESTS = true; final private static boolean ENABLE_SOURCE_VCF_TESTS = true; final private static boolean ENABLE_VARIABLE_LENGTH_GENOTYPE_STRING_TESTS = true; final private static List<Integer> TWENTY_INTS = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20); private static VCFHeader syntheticHeader; final static List<VariantContextTestData> TEST_DATAs = new ArrayList<VariantContextTestData>(); private static VariantContext ROOT; private final static List<File> testSourceVCFs = new ArrayList<File>(); static { testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf")); testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "ex2.vcf")); testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "dbsnp_135.b37.1000.vcf")); if ( ENABLE_SYMBOLIC_ALLELE_TESTS ) { testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "diagnosis_targets_testfile.vcf")); testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "VQSR.mixedTest.recal")); testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "breakpoint.vcf")); } } public static class VariantContextContainer { private VCFHeader header; private Iterable<VariantContext> vcs; public VariantContextContainer( VCFHeader header, Iterable<VariantContext> vcs ) { this.header = header; this.vcs = vcs; } public VCFHeader getHeader() { return header; } public Iterable<VariantContext> getVCs() { return vcs; } } public abstract static class VariantContextIOTest<CODECTYPE> { public String toString() { return "VariantContextIOTest:" + getExtension(); } public abstract String getExtension(); public abstract CODECTYPE makeCodec(); public abstract VariantContextWriter makeWriter(final File outputFile, final EnumSet<Options> baseOptions); public abstract VariantContextContainer readAllVCs(final File input) throws IOException; public List<VariantContext> preprocess(final VCFHeader header, List<VariantContext> vcsBeforeIO) { return vcsBeforeIO; } public List<VariantContext> postprocess(final VCFHeader header, List<VariantContext> vcsAfterIO) { return vcsAfterIO; } } public static class VariantContextTestData { public final VCFHeader header; public List<VariantContext> vcs; public VariantContextTestData(final VCFHeader header, final VariantContextBuilder builder) { this(header, Collections.singletonList(builder.fullyDecoded(true).make())); } public VariantContextTestData(final VCFHeader header, final List<VariantContext> vcs) { final Set<String> samples = new HashSet<String>(); for ( final VariantContext vc : vcs ) if ( vc.hasGenotypes() ) samples.addAll(vc.getSampleNames()); this.header = samples.isEmpty() ? header : new VCFHeader(header.getMetaDataInSortedOrder(), samples); this.vcs = vcs; } public boolean hasGenotypes() { return vcs.get(0).hasGenotypes(); } public String toString() { StringBuilder b = new StringBuilder(); b.append("VariantContextTestData: ["); final VariantContext vc = vcs.get(0); final VariantContextBuilder builder = new VariantContextBuilder(vc); builder.noGenotypes(); b.append(builder.make().toString()); if ( vc.getNSamples() < 5 ) { for ( final Genotype g : vc.getGenotypes() ) b.append(g.toString()); } else { b.append(" nGenotypes = ").append(vc.getNSamples()); } if ( vcs.size() > 1 ) b.append(" ----- with another ").append(vcs.size() - 1).append(" VariantContext records"); b.append("]"); return b.toString(); } } private final static VariantContextBuilder builder() { return new VariantContextBuilder(ROOT); } private final static void add(VariantContextBuilder builder) { TEST_DATAs.add(new VariantContextTestData(syntheticHeader, builder)); } public static void initializeTests() throws IOException { createSyntheticHeader(); makeSyntheticTests(); makeEmpiricalTests(); } private static void makeEmpiricalTests() throws IOException { if ( ENABLE_SOURCE_VCF_TESTS ) { for ( final File file : testSourceVCFs ) { VCFCodec codec = new VCFCodec(); VariantContextContainer x = readAllVCs( file, codec ); List<VariantContext> fullyDecoded = new ArrayList<VariantContext>(); for ( final VariantContext raw : x.getVCs() ) { if ( raw != null ) fullyDecoded.add(raw.fullyDecode(x.getHeader(), false)); } TEST_DATAs.add(new VariantContextTestData(x.getHeader(), fullyDecoded)); } } } private final static void addHeaderLine(final Set<VCFHeaderLine> metaData, final String id, final int count, final VCFHeaderLineType type) { metaData.add(new VCFInfoHeaderLine(id, count, type, "x")); if ( type != VCFHeaderLineType.Flag ) metaData.add(new VCFFormatHeaderLine(id, count, type, "x")); } private final static void addHeaderLine(final Set<VCFHeaderLine> metaData, final String id, final VCFHeaderLineCount count, final VCFHeaderLineType type) { metaData.add(new VCFInfoHeaderLine(id, count, type, "x")); if ( type != VCFHeaderLineType.Flag ) metaData.add(new VCFFormatHeaderLine(id, count, type, "x")); } private static void createSyntheticHeader() { Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>(); addHeaderLine(metaData, "STRING1", 1, VCFHeaderLineType.String); addHeaderLine(metaData, "END", 1, VCFHeaderLineType.Integer); addHeaderLine(metaData, "STRING3", 3, VCFHeaderLineType.String); addHeaderLine(metaData, "STRING20", 20, VCFHeaderLineType.String); addHeaderLine(metaData, "VAR.INFO.STRING", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); addHeaderLine(metaData, "GT", 1, VCFHeaderLineType.String); addHeaderLine(metaData, "GQ", 1, VCFHeaderLineType.Integer); addHeaderLine(metaData, "ADA", VCFHeaderLineCount.A, VCFHeaderLineType.Integer); addHeaderLine(metaData, "PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer); addHeaderLine(metaData, "GS", 2, VCFHeaderLineType.String); addHeaderLine(metaData, "GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); addHeaderLine(metaData, "FT", 1, VCFHeaderLineType.String); // prep the header metaData.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0)); metaData.add(new VCFFilterHeaderLine("FILTER1")); metaData.add(new VCFFilterHeaderLine("FILTER2")); addHeaderLine(metaData, "INT1", 1, VCFHeaderLineType.Integer); addHeaderLine(metaData, "INT3", 3, VCFHeaderLineType.Integer); addHeaderLine(metaData, "INT20", 20, VCFHeaderLineType.Integer); addHeaderLine(metaData, "INT.VAR", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer); addHeaderLine(metaData, "FLOAT1", 1, VCFHeaderLineType.Float); addHeaderLine(metaData, "FLOAT3", 3, VCFHeaderLineType.Float); addHeaderLine(metaData, "FLAG", 0, VCFHeaderLineType.Flag); syntheticHeader = new VCFHeader(metaData); } private static void makeSyntheticTests() { VariantContextBuilder rootBuilder = new VariantContextBuilder(); rootBuilder.source("test"); rootBuilder.loc("1", 10, 10); rootBuilder.alleles("A", "C"); rootBuilder.unfiltered(); ROOT = rootBuilder.make(); add(builder()); add(builder().alleles("A")); add(builder().alleles("A", "C", "T")); add(builder().alleles("A", "AC")); add(builder().alleles("A", "ACAGT")); add(builder().loc("1", 10, 11).alleles("AC", "A")); add(builder().loc("1", 10, 13).alleles("ACGT", "A")); // make sure filters work add(builder().unfiltered()); add(builder().passFilters()); add(builder().filters("FILTER1")); add(builder().filters("FILTER1", "FILTER2")); add(builder().log10PError(VariantContext.NO_LOG10_PERROR)); add(builder().log10PError(-1)); add(builder().log10PError(-1.234e6)); add(builder().noID()); add(builder().id("rsID12345")); add(builder().attribute("INT1", 1)); add(builder().attribute("INT1", 100)); add(builder().attribute("INT1", 1000)); add(builder().attribute("INT1", 100000)); add(builder().attribute("INT1", null)); add(builder().attribute("INT3", Arrays.asList(1, 2, 3))); add(builder().attribute("INT3", Arrays.asList(1000, 2000, 3000))); add(builder().attribute("INT3", Arrays.asList(100000, 200000, 300000))); add(builder().attribute("INT3", null)); add(builder().attribute("INT20", TWENTY_INTS)); add(builder().attribute("FLOAT1", 1.0)); add(builder().attribute("FLOAT1", 100.0)); add(builder().attribute("FLOAT1", 1000.0)); add(builder().attribute("FLOAT1", 100000.0)); add(builder().attribute("FLOAT1", null)); add(builder().attribute("FLOAT3", Arrays.asList(1.0, 2.0, 3.0))); add(builder().attribute("FLOAT3", Arrays.asList(1000.0, 2000.0, 3000.0))); add(builder().attribute("FLOAT3", Arrays.asList(100000.0, 200000.0, 300000.0))); add(builder().attribute("FLOAT3", null)); add(builder().attribute("FLAG", true)); //add(builder().attribute("FLAG", false)); // NOTE -- VCF doesn't allow false flags add(builder().attribute("STRING1", "s1")); add(builder().attribute("STRING1", null)); add(builder().attribute("STRING3", Arrays.asList("s1", "s2", "s3"))); add(builder().attribute("STRING3", null)); add(builder().attribute("STRING20", Arrays.asList("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20"))); add(builder().attribute("VAR.INFO.STRING", "s1")); add(builder().attribute("VAR.INFO.STRING", Arrays.asList("s1", "s2"))); add(builder().attribute("VAR.INFO.STRING", Arrays.asList("s1", "s2", "s3"))); add(builder().attribute("VAR.INFO.STRING", null)); if ( ENABLE_GENOTYPE_TESTS ) { addGenotypesToTestData(); addComplexGenotypesTest(); } if ( ENABLE_A_AND_G_TESTS ) addGenotypesAndGTests(); if ( ENABLE_SYMBOLIC_ALLELE_TESTS ) addSymbolicAlleleTests(); } private static void addSymbolicAlleleTests() { // two tests to ensure that the end is computed correctly when there's (and not) an END field present add(builder().alleles("N", "<VQSR>").start(10).stop(11).attribute("END", 11)); add(builder().alleles("N", "<VQSR>").start(10).stop(10)); } private static void addGenotypesToTestData() { final ArrayList<VariantContext> sites = new ArrayList<VariantContext>(); sites.add(builder().alleles("A").make()); sites.add(builder().alleles("A", "C", "T").make()); sites.add(builder().alleles("A", "AC").make()); sites.add(builder().alleles("A", "ACAGT").make()); for ( VariantContext site : sites ) { addGenotypes(site); } } private static void addGenotypeTests( final VariantContext site, Genotype ... genotypes ) { // for each sites VC, we are going to add create two root genotypes. // The first is the primary, and will be added to each new test // The second is variable. In some tests it's absent (testing 1 genotype), in others it is duplicated // 1 once, 10, 100, or 1000 times to test scaling final VariantContextBuilder builder = new VariantContextBuilder(site); // add a single context builder.genotypes(genotypes[0]); add(builder); if ( genotypes.length > 1 ) { // add all add(builder.genotypes(Arrays.asList(genotypes))); // add all with the last replicated 10x and 100x times for ( int nCopiesOfLast : Arrays.asList(10, 100, 1000) ) { final GenotypesContext gc = new GenotypesContext(); final Genotype last = genotypes[genotypes.length-1]; for ( int i = 0; i < genotypes.length - 1; i++ ) gc.add(genotypes[i]); for ( int i = 0; i < nCopiesOfLast; i++ ) gc.add(new GenotypeBuilder(last).name("copy" + i).make()); add(builder.genotypes(gc)); } } } private static void addGenotypes( final VariantContext site) { // test ref/ref final Allele ref = site.getReference(); final Allele alt1 = site.getNAlleles() > 1 ? site.getAlternateAllele(0) : null; final Genotype homRef = GenotypeBuilder.create("homRef", Arrays.asList(ref, ref)); addGenotypeTests(site, homRef); if ( alt1 != null ) { final Genotype het = GenotypeBuilder.create("het", Arrays.asList(ref, alt1)); final Genotype homVar = GenotypeBuilder.create("homVar", Arrays.asList(alt1, alt1)); addGenotypeTests(site, homRef, het); addGenotypeTests(site, homRef, het, homVar); // test no GT at all addGenotypeTests(site, new GenotypeBuilder("noGT", new ArrayList<Allele>(0)).attribute("INT1", 10).make()); final List<Allele> noCall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); // ploidy if ( ENABLE_PLOIDY_TESTS ) { addGenotypeTests(site, GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)), GenotypeBuilder.create("hap", Arrays.asList(ref))); addGenotypeTests(site, GenotypeBuilder.create("noCall", noCall), GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)), GenotypeBuilder.create("hap", Arrays.asList(ref))); addGenotypeTests(site, GenotypeBuilder.create("noCall", noCall), GenotypeBuilder.create("noCall2", noCall), GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)), GenotypeBuilder.create("hap", Arrays.asList(ref))); addGenotypeTests(site, GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)), GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1))); addGenotypeTests(site, GenotypeBuilder.create("noCall", noCall), GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)), GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1))); addGenotypeTests(site, GenotypeBuilder.create("noCall", noCall), GenotypeBuilder.create("noCall2", noCall), GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)), GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1))); addGenotypeTests(site, GenotypeBuilder.create("nocall", noCall), GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)), GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1))); } // // // TESTING PHASE // // final Genotype gUnphased = new GenotypeBuilder("gUnphased", Arrays.asList(ref, alt1)).make(); final Genotype gPhased = new GenotypeBuilder("gPhased", Arrays.asList(ref, alt1)).phased(true).make(); final Genotype gPhased2 = new GenotypeBuilder("gPhased2", Arrays.asList(alt1, alt1)).phased(true).make(); final Genotype gPhased3 = new GenotypeBuilder("gPhased3", Arrays.asList(ref, ref)).phased(true).make(); final Genotype haploidNoPhase = new GenotypeBuilder("haploidNoPhase", Arrays.asList(ref)).make(); addGenotypeTests(site, gUnphased, gPhased); addGenotypeTests(site, gUnphased, gPhased2); addGenotypeTests(site, gUnphased, gPhased3); addGenotypeTests(site, gPhased, gPhased2); addGenotypeTests(site, gPhased, gPhased3); addGenotypeTests(site, gPhased2, gPhased3); addGenotypeTests(site, haploidNoPhase, gPhased); addGenotypeTests(site, haploidNoPhase, gPhased2); addGenotypeTests(site, haploidNoPhase, gPhased3); addGenotypeTests(site, haploidNoPhase, gPhased, gPhased2); addGenotypeTests(site, haploidNoPhase, gPhased, gPhased3); addGenotypeTests(site, haploidNoPhase, gPhased2, gPhased3); addGenotypeTests(site, haploidNoPhase, gPhased, gPhased2, gPhased3); final Genotype gUnphasedTet = new GenotypeBuilder("gUnphasedTet", Arrays.asList(ref, alt1, ref, alt1)).make(); final Genotype gPhasedTet = new GenotypeBuilder("gPhasedTet", Arrays.asList(ref, alt1, alt1, alt1)).phased(true).make(); addGenotypeTests(site, gUnphasedTet, gPhasedTet); } if ( ENABLE_PL_TESTS ) { if ( site.getNAlleles() == 2 ) { // testing PLs addGenotypeTests(site, GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{0, -1, -2}), GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2, -3})); addGenotypeTests(site, GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{-1, 0, -2}), GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2, -3})); addGenotypeTests(site, GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{-1, 0, -2}), GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2000, -1000})); addGenotypeTests(site, // missing PLs GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{-1, 0, -2}), GenotypeBuilder.create("g2", Arrays.asList(ref, ref))); } else if ( site.getNAlleles() == 3 ) { // testing PLs addGenotypeTests(site, GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{0, -1, -2, -3, -4, -5}), GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2, -3, -4, -5, -6})); } } // test attributes addGenotypeTests(site, attr("g1", ref, "INT1", 1), attr("g2", ref, "INT1", 2)); addGenotypeTests(site, attr("g1", ref, "INT1", 1), attr("g2", ref, "INT1")); addGenotypeTests(site, attr("g1", ref, "INT3", 1, 2, 3), attr("g2", ref, "INT3", 4, 5, 6)); addGenotypeTests(site, attr("g1", ref, "INT3", 1, 2, 3), attr("g2", ref, "INT3")); addGenotypeTests(site, attr("g1", ref, "INT20", TWENTY_INTS), attr("g2", ref, "INT20", TWENTY_INTS)); if (ENABLE_VARARRAY_TESTS) { addGenotypeTests(site, attr("g1", ref, "INT.VAR", 1, 2, 3), attr("g2", ref, "INT.VAR", 4, 5), attr("g3", ref, "INT.VAR", 6)); addGenotypeTests(site, attr("g1", ref, "INT.VAR", 1, 2, 3), attr("g2", ref, "INT.VAR"), attr("g3", ref, "INT.VAR", 5)); } addGenotypeTests(site, attr("g1", ref, "FLOAT1", 1.0), attr("g2", ref, "FLOAT1", 2.0)); addGenotypeTests(site, attr("g1", ref, "FLOAT1", 1.0), attr("g2", ref, "FLOAT1")); addGenotypeTests(site, attr("g1", ref, "FLOAT3", 1.0, 2.0, 3.0), attr("g2", ref, "FLOAT3", 4.0, 5.0, 6.0)); addGenotypeTests(site, attr("g1", ref, "FLOAT3", 1.0, 2.0, 3.0), attr("g2", ref, "FLOAT3")); if (ENABLE_VARIABLE_LENGTH_GENOTYPE_STRING_TESTS) { // // // TESTING MULTIPLE SIZED LISTS IN THE GENOTYPE FIELD // // addGenotypeTests(site, attr("g1", ref, "GS", Arrays.asList("S1", "S2")), attr("g2", ref, "GS", Arrays.asList("S3", "S4"))); addGenotypeTests(site, // g1 is missing the string, and g2 is missing FLOAT1 attr("g1", ref, "FLOAT1", 1.0), attr("g2", ref, "GS", Arrays.asList("S3", "S4"))); // variable sized lists addGenotypeTests(site, attr("g1", ref, "GV", "S1"), attr("g2", ref, "GV", Arrays.asList("S3", "S4"))); addGenotypeTests(site, attr("g1", ref, "GV", Arrays.asList("S1", "S2")), attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5"))); addGenotypeTests(site, // missing value in varlist of string attr("g1", ref, "FLOAT1", 1.0), attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5"))); } // // // TESTING GENOTYPE FILTERS // // addGenotypeTests(site, new GenotypeBuilder("g1-x", Arrays.asList(ref, ref)).filters("X").make(), new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make()); addGenotypeTests(site, new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make()); addGenotypeTests(site, new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), new GenotypeBuilder("g2-xy", Arrays.asList(ref, ref)).filters("X", "Y").make()); addGenotypeTests(site, new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make(), new GenotypeBuilder("g3-xy", Arrays.asList(ref, ref)).filters("X", "Y").make()); } private static void addGenotypesAndGTests() { // for ( final int ploidy : Arrays.asList(2)) { for ( final int ploidy : Arrays.asList(1, 2, 3, 4, 5)) { final List<List<String>> alleleCombinations = Arrays.asList( Arrays.asList("A"), Arrays.asList("A", "C"), Arrays.asList("A", "C", "G"), Arrays.asList("A", "C", "G", "T")); for ( final List<String> alleles : alleleCombinations ) { final VariantContextBuilder vcb = builder().alleles(alleles); final VariantContext site = vcb.make(); final int nAlleles = site.getNAlleles(); final Allele ref = site.getReference(); // base genotype is ref/.../ref up to ploidy final List<Allele> baseGenotype = new ArrayList<Allele>(ploidy); for ( int i = 0; i < ploidy; i++) baseGenotype.add(ref); final int nPLs = GenotypeLikelihoods.numLikelihoods(nAlleles, ploidy); // ada is 0, 1, ..., nAlleles - 1 final List<Integer> ada = new ArrayList<Integer>(nAlleles); for ( int i = 0; i < nAlleles - 1; i++ ) ada.add(i); // pl is 0, 1, ..., up to nPLs (complex calc of nAlleles and ploidy) final int[] pl = new int[nPLs]; for ( int i = 0; i < pl.length; i++ ) pl[i] = i; final GenotypeBuilder gb = new GenotypeBuilder("ADA_PL_SAMPLE"); gb.alleles(baseGenotype); gb.PL(pl); gb.attribute("ADA", nAlleles == 2 ? ada.get(0) : ada); vcb.genotypes(gb.make()); add(vcb); } } } private static Genotype attr(final String name, final Allele ref, final String key, final Object ... value) { if ( value.length == 0 ) return GenotypeBuilder.create(name, Arrays.asList(ref, ref)); else { final Object toAdd = value.length == 1 ? value[0] : Arrays.asList(value); return new GenotypeBuilder(name, Arrays.asList(ref, ref)).attribute(key, toAdd).make(); } } public static List<VariantContextTestData> generateSiteTests() { return TEST_DATAs; } public static void testReaderWriterWithMissingGenotypes(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException { final int nSamples = data.header.getNGenotypeSamples(); if ( nSamples > 2 ) { for ( final VariantContext vc : data.vcs ) if ( vc.isSymbolic() ) // cannot handle symbolic alleles because they may be weird non-call VCFs return; final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension()); tmpFile.deleteOnExit(); Tribble.indexFile(tmpFile).deleteOnExit(); // write expected to disk final EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY); final VariantContextWriter writer = tester.makeWriter(tmpFile, options); final Set<String> samplesInVCF = new HashSet<String>(data.header.getGenotypeSamples()); final List<String> missingSamples = Arrays.asList("MISSING1", "MISSING2"); final List<String> allSamples = new ArrayList<String>(missingSamples); allSamples.addAll(samplesInVCF); final VCFHeader header = new VCFHeader(data.header.getMetaDataInInputOrder(), allSamples); writeVCsToFile(writer, header, data.vcs); // ensure writing of expected == actual final VariantContextContainer p = tester.readAllVCs(tmpFile); final Iterable<VariantContext> actual = p.getVCs(); int i = 0; for ( final VariantContext readVC : actual ) { if ( readVC == null ) continue; // sometimes we read null records... final VariantContext expected = data.vcs.get(i++); for ( final Genotype g : readVC.getGenotypes() ) { Assert.assertTrue(allSamples.contains(g.getSampleName())); if ( samplesInVCF.contains(g.getSampleName()) ) { assertEquals(g, expected.getGenotype(g.getSampleName())); } else { // missing Assert.assertTrue(g.isNoCall()); } } } } } public static void testReaderWriter(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException { testReaderWriter(tester, data.header, data.vcs, data.vcs, true); } public static void testReaderWriter(final VariantContextIOTest tester, final VCFHeader header, final List<VariantContext> expected, final Iterable<VariantContext> vcs, final boolean recurse) throws IOException { final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension()); tmpFile.deleteOnExit(); Tribble.indexFile(tmpFile).deleteOnExit(); // write expected to disk final EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY); final VariantContextWriter writer = tester.makeWriter(tmpFile, options); writeVCsToFile(writer, header, vcs); // ensure writing of expected == actual final VariantContextContainer p = tester.readAllVCs(tmpFile); final Iterable<VariantContext> actual = p.getVCs(); assertEquals(actual, expected); if ( recurse ) { // if we are doing a recursive test, grab a fresh iterator over the written values final Iterable<VariantContext> read = tester.readAllVCs(tmpFile).getVCs(); testReaderWriter(tester, p.getHeader(), expected, read, false); } } private static void writeVCsToFile(final VariantContextWriter writer, final VCFHeader header, final Iterable<VariantContext> vcs) { // write writer.writeHeader(header); for ( VariantContext vc : vcs ) if (vc != null) writer.add(vc); writer.close(); } public static abstract class VCIterable<SOURCE> implements Iterable<VariantContext>, Iterator<VariantContext> { final FeatureCodec<VariantContext, SOURCE> codec; final VCFHeader header; public VCIterable(final FeatureCodec<VariantContext, SOURCE> codec, final VCFHeader header) { this.codec = codec; this.header = header; } @Override public Iterator<VariantContext> iterator() { return this; } @Override public abstract boolean hasNext(); public abstract SOURCE nextSource(); @Override public VariantContext next() { try { final VariantContext vc = codec.decode(nextSource()); return vc == null ? null : vc.fullyDecode(header, false); } catch ( IOException e ) { throw new RuntimeException(e); } } @Override public void remove() { } } public static VariantContextContainer readAllVCs(final File input, final BCF2Codec codec) throws IOException { PositionalBufferedStream headerPbs = new PositionalBufferedStream(new FileInputStream(input)); FeatureCodecHeader header = codec.readHeader(headerPbs); headerPbs.close(); final PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(input)); pbs.skip(header.getHeaderEnd()); final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); return new VariantContextTestProvider.VariantContextContainer(vcfHeader, new VariantContextTestProvider.VCIterable(codec, vcfHeader) { @Override public boolean hasNext() { try { return !pbs.isDone(); } catch (IOException e) { throw new RuntimeException(e); } } @Override public Object nextSource() { return pbs; } }); } public static VariantContextContainer readAllVCs(final File input, final VCFCodec codec) throws FileNotFoundException { final LineIterator lineIterator = new LineIteratorImpl(LineReaderUtil.fromBufferedStream(new BufferedInputStream(new FileInputStream(input)))); final VCFHeader vcfHeader = (VCFHeader) codec.readActualHeader(lineIterator); return new VariantContextTestProvider.VariantContextContainer(vcfHeader, new VariantContextTestProvider.VCIterable<LineIterator>(codec, vcfHeader) { @Override public boolean hasNext() { return lineIterator.hasNext(); } @Override public LineIterator nextSource() { return lineIterator; } }); } public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException { final VariantContextContainer vcfData = readAllVCs(vcfFile, new VCFCodec()); final VariantContextContainer bcfData = readAllVCs(bcfFile, new BCF2Codec()); assertEquals(bcfData.getHeader(), vcfData.getHeader()); assertEquals(bcfData.getVCs(), vcfData.getVCs()); } public static void assertEquals(final Iterable<VariantContext> actual, final Iterable<VariantContext> expected) { final Iterator<VariantContext> actualIT = actual.iterator(); final Iterator<VariantContext> expectedIT = expected.iterator(); while ( expectedIT.hasNext() ) { final VariantContext expectedVC = expectedIT.next(); if ( expectedVC == null ) continue; VariantContext actualVC; do { Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual"); actualVC = actualIT.next(); } while ( actualIT.hasNext() && actualVC == null ); if ( actualVC == null ) Assert.fail("Too few records in actual"); assertEquals(actualVC, expectedVC); } Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual"); } /** * Assert that two variant contexts are actually equal * @param actual * @param expected */ public static void assertEquals( final VariantContext actual, final VariantContext expected ) { Assert.assertNotNull(actual, "VariantContext expected not null"); Assert.assertEquals(actual.getChr(), expected.getChr(), "chr"); Assert.assertEquals(actual.getStart(), expected.getStart(), "start"); Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end"); Assert.assertEquals(actual.getID(), expected.getID(), "id"); Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual); assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied"); Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "isFiltered"); VariantBaseTest.assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters"); VariantBaseTest.assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual()); Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes"); if ( expected.hasGenotypes() ) { VariantBaseTest.assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set"); Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names"); final Set<String> samples = expected.getSampleNames(); for ( final String sample : samples ) { assertEquals(actual.getGenotype(sample), expected.getGenotype(sample)); } } } public static void assertEquals(final Genotype actual, final Genotype expected) { Assert.assertEquals(actual.getSampleName(), expected.getSampleName(), "Genotype names"); Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles"); Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string"); Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type"); // filters are the same Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype fields"); Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered"); // inline attributes Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp"); Assert.assertTrue(Arrays.equals(actual.getAD(), expected.getAD())); Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq"); Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL"); Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD"); Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ"); Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP"); Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods"); Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString"); Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods"); Assert.assertTrue(Arrays.equals(actual.getPL(), expected.getPL())); Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual"); assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes()); Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased"); Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy"); } private static void assertAttributesEquals(final Map<String, Object> actual, Map<String, Object> expected) { final Set<String> expectedKeys = new HashSet<String>(expected.keySet()); for ( final Map.Entry<String, Object> act : actual.entrySet() ) { final Object actualValue = act.getValue(); if ( expected.containsKey(act.getKey()) && expected.get(act.getKey()) != null ) { final Object expectedValue = expected.get(act.getKey()); if ( expectedValue instanceof List ) { final List<Object> expectedList = (List<Object>)expectedValue; Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't"); final List<Object> actualList = (List<Object>)actualValue; Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size"); for ( int i = 0; i < expectedList.size(); i++ ) assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i)); } else assertAttributeEquals(act.getKey(), actualValue, expectedValue); } else { // it's ok to have a binding in x -> null that's absent in y Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other"); } expectedKeys.remove(act.getKey()); } // now expectedKeys contains only the keys found in expected but not in actual, // and they must all be null for ( final String missingExpected : expectedKeys ) { final Object value = expected.get(missingExpected); Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" ); } } private static final boolean isMissing(final Object value) { if ( value == null ) return true; else if ( value.equals(VCFConstants.MISSING_VALUE_v4) ) return true; else if ( value instanceof List ) { // handles the case where all elements are null or the list is empty for ( final Object elt : (List)value) if ( elt != null ) return false; return true; } else return false; } private static void assertAttributeEquals(final String key, final Object actual, final Object expected) { if ( expected instanceof Double ) { // must be very tolerant because doubles are being rounded to 2 sig figs VariantBaseTest.assertEqualsDoubleSmart(actual, (Double)expected, 1e-2); } else Assert.assertEquals(actual, expected, "Attribute " + key); } public static void addComplexGenotypesTest() { final List<Allele> allAlleles = Arrays.asList( Allele.create("A", true), Allele.create("C", false), Allele.create("G", false)); for ( int nAlleles : Arrays.asList(2, 3) ) { for ( int highestPloidy : Arrays.asList(1, 2, 3) ) { // site alleles final List<Allele> siteAlleles = allAlleles.subList(0, nAlleles); // possible alleles for genotypes final List<Allele> possibleGenotypeAlleles = new ArrayList<Allele>(siteAlleles); possibleGenotypeAlleles.add(Allele.NO_CALL); // there are n^ploidy possible genotypes final List<List<Allele>> possibleGenotypes = makeAllGenotypes(possibleGenotypeAlleles, highestPloidy); final int nPossibleGenotypes = possibleGenotypes.size(); VariantContextBuilder vb = new VariantContextBuilder("unittest", "1", 1, 1, siteAlleles); // first test -- create n copies of each genotype for ( int i = 0; i < nPossibleGenotypes; i++ ) { final List<Genotype> samples = new ArrayList<Genotype>(3); samples.add(GenotypeBuilder.create("sample" + i, possibleGenotypes.get(i))); add(vb.genotypes(samples)); } // second test -- create one sample with each genotype { final List<Genotype> samples = new ArrayList<Genotype>(nPossibleGenotypes); for ( int i = 0; i < nPossibleGenotypes; i++ ) { samples.add(GenotypeBuilder.create("sample" + i, possibleGenotypes.get(i))); } add(vb.genotypes(samples)); } // test mixed ploidy for ( int i = 0; i < nPossibleGenotypes; i++ ) { for ( int ploidy = 1; ploidy < highestPloidy; ploidy++ ) { final List<Genotype> samples = new ArrayList<Genotype>(highestPloidy); final List<Allele> genotype = possibleGenotypes.get(i).subList(0, ploidy); samples.add(GenotypeBuilder.create("sample" + i, genotype)); add(vb.genotypes(samples)); } } } } } private static List<List<Allele>> makeAllGenotypes(final List<Allele> alleles, final int highestPloidy) { return GeneralUtils.makePermutations(alleles, highestPloidy, true); } public static void assertEquals(final VCFHeader actual, final VCFHeader expected) { Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines"); // for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted? //Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder()); final List<VCFHeaderLine> actualLines = new ArrayList<VCFHeaderLine>(actual.getMetaDataInSortedOrder()); final List<VCFHeaderLine> expectedLines = new ArrayList<VCFHeaderLine>(expected.getMetaDataInSortedOrder()); for ( int i = 0; i < actualLines.size(); i++ ) { Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines"); } } public static void main( String argv[] ) { final File variants1 = new File(argv[0]); final File variants2 = new File(argv[1]); try { VariantContextTestProvider.assertVCFandBCFFilesAreTheSame(variants1, variants2); } catch ( IOException e ) { throw new RuntimeException(e); } } }