/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package htsjdk.variant.variantcontext.writer;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.TestUtil;
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.FeatureReader;
import htsjdk.tribble.Tribble;
import htsjdk.tribble.util.TabixUtils;
import htsjdk.variant.VariantBaseTest;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFHeaderVersion;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.testng.Assert;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
/**
* @author aaron
* <p/>
* Class VCFWriterUnitTest
* <p/>
* This class tests the ability of the VCF writer to correctly write VCF files.
*/
public class VCFWriterUnitTest extends VariantBaseTest {
private Set<VCFHeaderLine> metaData;
private Set<String> additionalColumns;
private File tempDir;
@BeforeClass
private void createTemporaryDirectory() {
tempDir = TestUtil.getTempDirectory("VCFWriter", "StaleIndex");
}
@AfterClass
private void deleteTemporaryDirectory() {
for (File f : tempDir.listFiles()) {
f.delete();
}
tempDir.delete();
}
/** test, using the writer and reader, that we can output and input a VCF file without problems */
@Test(dataProvider = "vcfExtensionsDataProvider")
public void testBasicWriteAndRead(final String extension) throws IOException {
final File fakeVCFFile = File.createTempFile("testBasicWriteAndRead.", extension);
fakeVCFFile.deleteOnExit();
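// INDEX_ON_THE_FLY (set on the writer below) also leaves an index next to the output,
// so clean that up too: a tabix .tbi for block-compressed output, a Tribble .idx otherwise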
if (".vcf.gz".equals(extension)) {
new File(fakeVCFFile.getAbsolutePath() + ".tbi").deleteOnExit();
} else {
Tribble.indexFile(fakeVCFFile).deleteOnExit();
}
metaData = new HashSet<VCFHeaderLine>();
additionalColumns = new HashSet<String>();
final SAMSequenceDictionary sequenceDict = createArtificialSequenceDictionary();
final VCFHeader header = createFakeHeader(metaData, additionalColumns, sequenceDict);
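// Build a writer that indexes as it writes (INDEX_ON_THE_FLY) and tolerates attributes
// that are not declared in the header (ALLOW_MISSING_FIELDS_IN_HEADER), since the fake
// header declares no INFO/FORMAT fields for the records created by createVC()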
final VariantContextWriter writer = new VariantContextWriterBuilder()
.setOutputFile(fakeVCFFile)
.setReferenceDictionary(sequenceDict)
.setOptions(EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER, Options.INDEX_ON_THE_FLY))
.build();
writer.writeHeader(header);
writer.add(createVC(header));
writer.add(createVC(header));
writer.close();
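// read the file back in; requireIndex=false lets the reader stream the records without using the index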
final VCFCodec codec = new VCFCodec();
final FeatureReader<VariantContext> reader = AbstractFeatureReader.getFeatureReader(fakeVCFFile.getAbsolutePath(), codec, false);
final VCFHeader headerFromFile = (VCFHeader)reader.getHeader();
int counter = 0;
// validate what we're reading in
validateHeader(headerFromFile, sequenceDict);
try {
final Iterator<VariantContext> it = reader.iterator();
while(it.hasNext()) {
it.next();
counter++;
}
Assert.assertEquals(counter, 2);
}
catch (final IOException e ) {
throw new RuntimeException(e.getMessage(), e);
}
}
/**
* create a fake header with known contents
* @param metaData the header lines
* @param additionalColumns the additional column names
* @param sequenceDict the sequence dictionary to attach to the header
* @return a fake VCF header
*/
public static VCFHeader createFakeHeader(final Set<VCFHeaderLine> metaData, final Set<String> additionalColumns,
final SAMSequenceDictionary sequenceDict) {
metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString()));
metaData.add(new VCFHeaderLine("two", "2"));
additionalColumns.add("extra1");
additionalColumns.add("extra2");
final VCFHeader ret = new VCFHeader(metaData, additionalColumns);
ret.setSequenceDictionary(sequenceDict);
return ret;
}
/**
* create a fake VCF record
* @param header the VCF header
* @return a VariantContext
*/
private VariantContext createVC(final VCFHeader header) {
return createVCGeneral(header,"1",1);
}
private VariantContext createVCGeneral(final VCFHeader header, final String chrom, final int position) {
final List<Allele> alleles = new ArrayList<Allele>();
final Map<String, Object> attributes = new HashMap<String,Object>();
final GenotypesContext genotypes = GenotypesContext.create(header.getGenotypeSamples().size());
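// a simple A -> ACC insertion with a site-level DP attribute (not declared in the fake header)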
alleles.add(Allele.create("A",true));
alleles.add(Allele.create("ACC",false));
attributes.put("DP","50");
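// one phased, ALT-only genotype per sample, with GQ 0 and an undeclared "BB" attribute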
for (final String name : header.getGenotypeSamples()) {
final Genotype gt = new GenotypeBuilder(name,alleles.subList(1,2)).GQ(0).attribute("BB", "1").phased(true).make();
genotypes.add(gt);
}
return new VariantContextBuilder("RANDOM", chrom, position, position, alleles)
.genotypes(genotypes).attributes(attributes).make();
}
/**
* validate a VCF header
* @param header the header to validate
* @param sequenceDictionary the sequence dictionary the header was built with
*/
public void validateHeader(final VCFHeader header, final SAMSequenceDictionary sequenceDictionary) {
// check the fields
int index = 0;
for (final VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) {
Assert.assertEquals(VCFHeader.HEADER_FIELDS.values()[index], field);
index++;
}
Assert.assertEquals(header.getMetaDataInSortedOrder().size(), metaData.size() + sequenceDictionary.size());
index = 0;
for (final String key : header.getGenotypeSamples()) {
Assert.assertTrue(additionalColumns.contains(key));
index++;
}
Assert.assertEquals(index, additionalColumns.size());
}
@Test(dataProvider = "vcfExtensionsDataProvider")
public void testWritingLargeVCF(final String extension) throws FileNotFoundException, InterruptedException {
final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
final Set<String> columns = new HashSet<String>();
for (int i = 0; i < 123; i++) {
columns.add(String.format("SAMPLE_%d", i));
}
final SAMSequenceDictionary dict = createArtificialSequenceDictionary();
final VCFHeader header = createFakeHeader(metaData, columns, dict);
final File vcf = new File(tempDir, "test" + extension);
final String indexExtension;
if (extension.equals(".vcf.gz")) {
indexExtension = TabixUtils.STANDARD_INDEX_EXTENSION;
} else {
indexExtension = Tribble.STANDARD_INDEX_EXTENSION;
}
final File vcfIndex = new File(vcf.getAbsolutePath() + indexExtension);
vcfIndex.deleteOnExit();
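// write the VCF with an on-the-fly index and verify afterwards that the index is not
// stale, i.e. that it was finished no earlier than the VCF it describes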
for (int count = 1; count < 2; count++) {
final VariantContextWriter writer = new VariantContextWriterBuilder()
.setOutputFile(vcf)
.setReferenceDictionary(dict)
.setOptions(EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER, Options.INDEX_ON_THE_FLY))
.build();
writer.writeHeader(header);
for (int i = 1; i < 17; i++) { // write 16 chromosomes
for (int j = 1; j < 10; j++) { // 9 records each
writer.add(createVCGeneral(header, String.format("%d", i), j * 100));
}
}
writer.close();
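// a stale index would have an older timestamp than the VCF it indexes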
Assert.assertTrue(vcf.lastModified() <= vcfIndex.lastModified());
}
}
@DataProvider(name = "vcfExtensionsDataProvider")
public Object[][] vcfExtensionsDataProvider() {
return new Object[][] {
// TODO: BCF doesn't work because header is not properly constructed.
// {".bcf"},
{".vcf"},
{".vcf.gz"}
};
}
}