package picard.vcf; import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.ProgressLogger; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import htsjdk.variant.vcf.VCFFileReader; import htsjdk.variant.vcf.VCFHeader; import picard.PicardException; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.VcfOrBcf; import java.io.File; /** * Splits the input VCF file into two, one for indels and one for SNPs. The headers of the two output * files will be identical. * * An index file is created for the output file by default. Using an output file name with a ".gz" * extension will create gzip-compressed output. */ @CommandLineProgramProperties( usage = "Splits an input VCF or BCF file into two VCF files, one for indel records and one for SNPs. The" + "headers of the two output files will be identical. An index file is created and a" + "sequence dictionary is required by default.", usageShort = "Splits an input VCF or BCF file into two VCF or BCF files", programGroup = VcfOrBcf.class ) public class SplitVcfs extends CommandLineProgram { @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc="The VCF or BCF input file") public File INPUT; @Option(doc="The VCF or BCF file to which SNP records should be written. The file format is determined by file extension.") public File SNP_OUTPUT; @Option(doc="The VCF or BCF file to which indel records should be written. The file format is determined by file extension.") public File INDEL_OUTPUT; @Option(shortName="D", doc="The index sequence dictionary to use instead of the sequence dictionaries in the input files", optional = true) public File SEQUENCE_DICTIONARY; @Option(doc="If true an exception will be thrown if an event type other than SNP or indel is encountered") public Boolean STRICT = true; private final Log log = Log.getInstance(SplitVcfs.class); public static void main(final String[] argv) { new SplitVcfs().instanceMainWithExit(argv); } public SplitVcfs() { this.CREATE_INDEX = true; } @Override protected int doWork() { IOUtil.assertFileIsReadable(INPUT); final ProgressLogger progress = new ProgressLogger(log, 10000); final VCFFileReader fileReader = new VCFFileReader(INPUT); final VCFHeader fileHeader = fileReader.getFileHeader(); final SAMSequenceDictionary sequenceDictionary = SEQUENCE_DICTIONARY != null ? SAMFileReader.getSequenceDictionary(SEQUENCE_DICTIONARY) : fileHeader.getSequenceDictionary(); if (CREATE_INDEX && sequenceDictionary == null) { throw new PicardException("A sequence dictionary must be available (either through the input file or by setting it explicitly) when creating indexed output."); } final VariantContextWriterBuilder builder = new VariantContextWriterBuilder() .setReferenceDictionary(sequenceDictionary) .clearOptions(); if (CREATE_INDEX) builder.setOption(Options.INDEX_ON_THE_FLY); final VariantContextWriter snpWriter = builder.setOutputFile(SNP_OUTPUT).build(); final VariantContextWriter indelWriter = builder.setOutputFile(INDEL_OUTPUT).build(); snpWriter.writeHeader(fileHeader); indelWriter.writeHeader(fileHeader); int incorrectVariantCount = 0; final CloseableIterator<VariantContext> iterator = fileReader.iterator(); while (iterator.hasNext()) { final VariantContext context = iterator.next(); if (context.isIndel()) indelWriter.add(context); else if (context.isSNP()) snpWriter.add(context); else { if (STRICT) throw new IllegalStateException("Found a record with type " + context.getType().name()); else incorrectVariantCount++; } progress.record(context.getChr(), context.getStart()); } if (incorrectVariantCount > 0) { log.debug("Found " + incorrectVariantCount + " records that didn't match SNP or INDEL"); } CloserUtil.close(iterator); CloserUtil.close(fileReader); snpWriter.close(); indelWriter.close(); return 0; } }