package picard.vcf; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.ProgressLogger; import htsjdk.variant.variantcontext.GenotypesContext; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import htsjdk.variant.vcf.VCFFileReader; import htsjdk.variant.vcf.VCFHeader; import picard.PicardException; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.VcfOrBcf; import java.io.File; import java.util.Set; import java.util.TreeSet; /** * Writes out a VCF that contains all the site-level information for all records in the input VCF and no per-sample information. * * @author Tim Fennell */ @CommandLineProgramProperties( usage = "Reads a VCF/VCF.gz/BCF and removes all genotype information from it while retaining " + "all site level information, including annotations based on genotypes (e.g. AN, AF). Output an be " + "any support variant format including .vcf, .vcf.gz or .bcf.", usageShort = "Creates a VCF bereft of genotype information from an input VCF or BCF", programGroup = VcfOrBcf.class ) public class MakeSitesOnlyVcf extends CommandLineProgram { @Option(shortName= StandardOptionDefinitions.INPUT_SHORT_NAME, doc="Input VCF or BCF") public File INPUT; @Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="Output VCF or BCF to emit without per-sample info.") public File OUTPUT; @Option(shortName="S", doc="Optionally one or more samples to retain when building the 'sites-only' VCF.", optional=true) public Set<String> SAMPLE = new TreeSet<String>(); // Stock main method public static void main(final String[] args) { new MakeSitesOnlyVcf().instanceMainWithExit(args); } public MakeSitesOnlyVcf() { CREATE_INDEX = true; } @Override protected int doWork() { IOUtil.assertFileIsReadable(INPUT); IOUtil.assertFileIsWritable(OUTPUT); final VCFFileReader reader = new VCFFileReader(INPUT, false); final VCFHeader inputVcfHeader = new VCFHeader(reader.getFileHeader().getMetaDataInInputOrder()); final SAMSequenceDictionary sequenceDictionary = inputVcfHeader.getSequenceDictionary(); if (CREATE_INDEX && sequenceDictionary == null) { throw new PicardException("A sequence dictionary must be available (either through the input file or by setting it explicitly) when creating indexed output."); } final ProgressLogger progress = new ProgressLogger(Log.getInstance(MakeSitesOnlyVcf.class), 10000); // Setup the site-only file writer final VariantContextWriterBuilder builder = new VariantContextWriterBuilder() .setOutputFile(OUTPUT) .setReferenceDictionary(sequenceDictionary); if (CREATE_INDEX) builder.setOption(Options.INDEX_ON_THE_FLY); else builder.unsetOption(Options.INDEX_ON_THE_FLY); final VariantContextWriter writer = builder.build(); final VCFHeader header = new VCFHeader(inputVcfHeader.getMetaDataInInputOrder(), SAMPLE); writer.writeHeader(header); // Go through the input, strip the records and write them to the output final CloseableIterator<VariantContext> iterator = reader.iterator(); while (iterator.hasNext()) { final VariantContext full = iterator.next(); final VariantContext site = subsetToSamplesWithOriginalAnnotations(full, SAMPLE); writer.add(site); progress.record(site.getChr(), site.getStart()); } CloserUtil.close(iterator); CloserUtil.close(reader); writer.close(); return 0; } /** Makes a new VariantContext with only the desired samples. */ private static VariantContext subsetToSamplesWithOriginalAnnotations(final VariantContext ctx, final Set<String> samples) { final VariantContextBuilder builder = new VariantContextBuilder(ctx); final GenotypesContext newGenotypes = ctx.getGenotypes().subsetToSamples(samples); builder.alleles(ctx.getAlleles()); return builder.genotypes(newGenotypes).make(); } }