package picard.vcf;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.VcfOrBcf;

import java.io.File;

/**
 * Splits the input VCF file into two, one for indels and one for SNPs. The headers of the two output
 * files will be identical.
 * <p/>
 * An index file is created for the output file by default. Using an output file name with a ".gz"
 * extension will create gzip-compressed output.
 */
@CommandLineProgramProperties(
        usage = SplitVcfs.USAGE_SUMMARY + SplitVcfs.USAGE_DETAILS,
        usageShort = SplitVcfs.USAGE_SUMMARY,
        programGroup = VcfOrBcf.class
)
public class SplitVcfs extends CommandLineProgram {
    static final String USAGE_SUMMARY = "Splits SNPs and INDELs into separate files. ";
    static final String USAGE_DETAILS = "This tool reads in a VCF or BCF file and writes out the SNPs and INDELs it contains to separate " +
            "files. The headers of the two output files will be identical and index files will be created for both outputs. If records " +
            "other than SNPs or INDELs are present, set the STRICT option to \"false\", otherwise the tool will raise an exception and " +
            "quit. <br />" +
            "<h4>Usage example:</h4>" +
            "<pre>" +
            "java -jar picard.jar SplitVcfs \\<br />" +
            "      I=input.vcf \\<br />" +
            "      SNP_OUTPUT=snp.vcf \\<br />" +
            "      INDEL_OUTPUT=indel.vcf \\<br />" +
            "      STRICT=false" +
            "</pre>" +
            "<hr />";

    @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The VCF or BCF input file")
    public File INPUT;

    @Option(doc = "The VCF or BCF file to which SNP records should be written. The file format is determined by file extension.")
    public File SNP_OUTPUT;

    @Option(doc = "The VCF or BCF file to which indel records should be written. The file format is determined by file extension.")
    public File INDEL_OUTPUT;

    @Option(shortName = "D", doc = "The index sequence dictionary to use instead of the sequence dictionaries in the input files", optional = true)
    public File SEQUENCE_DICTIONARY;

    @Option(doc = "If true an exception will be thrown if an event type other than SNP or indel is encountered")
    public Boolean STRICT = true;

    private final Log log = Log.getInstance(SplitVcfs.class);

    public static void main(final String[] argv) {
        new SplitVcfs().instanceMainWithExit(argv);
    }

    public SplitVcfs() {
        this.CREATE_INDEX = true;
    }

    @Override
    protected int doWork() {
        IOUtil.assertFileIsReadable(INPUT);
        final ProgressLogger progress = new ProgressLogger(log, 10000);

        final VCFFileReader fileReader = new VCFFileReader(INPUT);
        final VCFHeader fileHeader = fileReader.getFileHeader();

        // Use the explicitly supplied sequence dictionary if given; otherwise fall back to the dictionary in the input header.
        final SAMSequenceDictionary sequenceDictionary =
                SEQUENCE_DICTIONARY != null
                        ? SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).getFileHeader(SEQUENCE_DICTIONARY).getSequenceDictionary()
                        : fileHeader.getSequenceDictionary();
        if (CREATE_INDEX && sequenceDictionary == null) {
            throw new PicardException("A sequence dictionary must be available (either through the input file or by setting it explicitly) when creating indexed output.");
        }

        final VariantContextWriterBuilder builder = new VariantContextWriterBuilder()
                .setReferenceDictionary(sequenceDictionary)
                .clearOptions();
        if (CREATE_INDEX) builder.setOption(Options.INDEX_ON_THE_FLY);

        // Both output files receive an identical copy of the input header.
        final VariantContextWriter snpWriter = builder.setOutputFile(SNP_OUTPUT).build();
        final VariantContextWriter indelWriter = builder.setOutputFile(INDEL_OUTPUT).build();
        snpWriter.writeHeader(fileHeader);
        indelWriter.writeHeader(fileHeader);

        int incorrectVariantCount = 0;

        // Route each record to the appropriate writer. Records that are neither SNPs nor indels
        // raise an exception in STRICT mode; otherwise they are counted and dropped.
        final CloseableIterator<VariantContext> iterator = fileReader.iterator();
        while (iterator.hasNext()) {
            final VariantContext context = iterator.next();
            if (context.isIndel()) indelWriter.add(context);
            else if (context.isSNP()) snpWriter.add(context);
            else {
                if (STRICT) throw new IllegalStateException("Found a record with type " + context.getType().name());
                else incorrectVariantCount++;
            }

            progress.record(context.getContig(), context.getStart());
        }

        if (incorrectVariantCount > 0) {
            log.debug("Found " + incorrectVariantCount + " records that didn't match SNP or INDEL");
        }

        CloserUtil.close(iterator);
        CloserUtil.close(fileReader);
        snpWriter.close();
        indelWriter.close();

        return 0;
    }
}