/* The MIT License (MIT) Copyright (c) 2015 Pierre Lindenbaum Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package com.github.lindenb.jvarkit.tools.misc; import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileHeader.SortOrder; import htsjdk.samtools.CigarElement; import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.SequenceUtil; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeader; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.TreeSet; import com.beust.jcommander.Parameter; import com.beust.jcommander.ParametersDelegate; import com.github.lindenb.jvarkit.util.jcommander.Launcher; import com.github.lindenb.jvarkit.util.jcommander.Program; import com.github.lindenb.jvarkit.util.log.Logger; import com.github.lindenb.jvarkit.util.picard.GenomicSequence; import com.github.lindenb.jvarkit.util.vcf.VcfIterator; /* BEGIN_DOC ##Example ```bash $ java -jar dist-1.139/vcf2bam.jar -R ref.fa samtools.vcf.gz 2> /dev/null | grep -v "100=" @HD VN:1.5 SO:unsorted @SQ SN:seg1 LN:5101 @SQ SN:seg2 LN:4000 @RG ID:NA12878 SM:NA12878 LB:illumina DS:NA12878 @RG ID:NA12891 SM:NA12891 LB:illumina DS:NA12891 @RG ID:NA12892 SM:NA12892 LB:illumina DS:NA12892 @CO Generated with -R /home/lindenb/src/ngsxml/test/ref/ref.fa /home/lindenb/src/ngsxml/OUT/Projects/Proj1/VCF/samtools/Proj1.samtools.vcf.gz 0000003609 83 seg1 1449 60 99=1X = 949 -500 TGCTTTAGAAAGCATTCCAAAATCTCTTACCAGTTTTATCTCCTATGAAAGTCCTTCACACTTTCTCTCATTTAAACTTTATTGCATTTTCCTCACTTTC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:NA12891 NM:i:1 0000003610 147 seg1 1449 60 99=1X = 949 -500 TGCTTTAGAAAGCATTCCAAAATCTCTTACCAGTTTTATCTCCTATGAAAGTCCTTCACACTTTCTCTCATTTAAACTTTATTGCATTTTCCTCACTTTC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:NA12891 NM:i:1 0000003611 147 seg1 1449 60 99=1X = 949 -500 TGCTTTAGAAAGCATTCCAAAATCTCTTACCAGTTTTATCTCCTATGAAAGTCCTTCACACTTTCTCTCATTTAAACTTTATTGCATTTTCCTCACTTTC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:NA12892 NM:i:1 0000003612 83 seg1 1449 60 99=1X = 949 -500 TGCTTTAGAAAGCATTCCAAAATCTCTTACCAGTTTTATCTCCTATGAAAGTCCTTCACACTTTCTCTCATTTAAACTTTATTGCATTTTCCTCACTTTC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:NA12892 NM:i:1 0000003615 83 seg1 1450 60 98=1X1= = 950 -500 GCTTTAGAAAGCATTCCAAAATCTCTTACCAGTTTTATCTCCTATGAAAGTCCTTCACACTTTCTCTCATTTAAACTTTATTGCATTTTCCTCACTTTCT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:NA12891 NM:i:1 0000003616 147 seg1 1450 60 98=1X1= = 950 -500 GCTTTAGAAAGCATTCCAAAATCTCTTACCAGTTTTATCTCCTATGAAAGTCCTTCACACTTTCTCTCATTTAAACTTTATTGCATTTTCCTCACTTTCT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:NA12891 NM:i:1 0000003617 147 seg1 1450 60 98=1X1= = 950 -500 GCTTTAGAAAGCATTCCAAAATCTCTTACCAGTTTTATCTCCTATGAAAGTCCTTCACACTTTCTCTCATTTAAACTTTATTGCATTTTCCTCACTTTCT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:NA12892 NM:i:1 0000003627 147 seg1 1452 60 96=1X3= = 952 -500 TTTAGAAAGCATTCCAAAATCTCTTACCAGTTTTATCTCCTATGAAAGTCCTTCACACTTTCTCTCATTTAAACTTTATTGCATTTTCCTCACTTTCTCT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:NA12891 NM:i:1 0000003629 147 seg1 1452 60 96=1X3= = 952 -500 TTTAGAAAGCATTCCAAAATCTCTTACCAGTTTTATCTCCTATGAAAGTCCTTCACACTTTCTCTCATTTAAACTTTATTGCATTTTCCTCACTTTCTCT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:NA12892 NM:i:1 0000003630 147 seg1 1452 60 96=1X3= = 952 -500 TTTAGAAAGCATTCCAAAATCTCTTACCAGTTTTATCTCCTATGAAAGTCCTTCACACTTTCTCTCATTTAAACTTTATTGCATTTTCCTCACTTTCTCT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:NA12892 NM:i:1 0000003633 147 seg1 1453 60 95=1X4= = 953 -500 TTAGAAAGCATTCCAAAATCTCTTACCAGTTTTATCTCCTATGAAAGTCCTTCACACTTTCTCTCATTTAAACTTTATTGCATTTTCCTCACTTTCTCTC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:NA12891 NM:i:1 0000003635 83 seg1 1453 60 95=1X4= = 953 -500 TTAGAAAGCATTCCAAAATCTCTTACCAGTTTTATCTCCTATGAAAGTCCTTCACACTTTCTCTCATTTAAACTTTATTGCATTTTCCTCACTTTCTCTC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:NA12892 NM:i:1 0000003640 147 seg1 1454 60 94=1X5= = 954 -500 TAGAAAGCATTCCAAAATCTCTTACCAGTTTTATCTCCTATGAAAGTCCTTCACACTTTCTCTCATTTAAACTTTATTGCATTTTCCTCACTTTCTCTCA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII RG:Z:NA12891 NM:i:1 ``` END_DOC */ @Program(name="vcf2bam",description="vcf to bam") public class VcfToBam extends Launcher { private static final Logger LOG=Logger.build(VcfToBam.class).make(); @Parameter(names={"-o","--output"},description="Ouput file. Default: stdout") private File outputFile=null; @Parameter(names={"-r","-R","--reference"},description="indexed fasta reference",required=true) private File faidx=null; @ParametersDelegate private WritingBamArgs writingBamArgs=new WritingBamArgs(); private IndexedFastaSequenceFile indexedFastaSequenceFile; @Parameter(names="--fragmentsize",description="fragment size") private int fragmentSize=600; @Parameter(names="--readsize",description="read size") private int readSize=100; private void run(VcfIterator vcfIterator) throws IOException { long id_generator=0L; SAMFileWriter samFileWriter =null; final VCFHeader header = vcfIterator.getHeader(); SAMSequenceDictionary dict = header.getSequenceDictionary(); if(dict==null) throw new IOException("Sequence Dictionary missing in VCF"); if(!SequenceUtil.areSequenceDictionariesEqual(dict, indexedFastaSequenceFile.getSequenceDictionary())) { LOG.warning( "REF/VCF: not the same Sequence dictonary "+dict+" "+ indexedFastaSequenceFile.getSequenceDictionary() ); } final SAMFileHeader samHeader=new SAMFileHeader(); samHeader.setSequenceDictionary(dict); for(final String sample:new TreeSet<>(header.getSampleNamesInOrder())) { final SAMReadGroupRecord rg= new SAMReadGroupRecord(sample); rg.setSample(sample); rg.setLibrary(sample); rg.setDescription(sample); rg.setLibrary("illumina"); samHeader.addReadGroup(rg); } samHeader.addComment("Generated with "+getProgramCommandLine()); samHeader.setSortOrder(SortOrder.unsorted); samFileWriter= this.writingBamArgs. setReferenceFile(this.faidx). openSAMFileWriter(this.outputFile, samHeader, true); /* looping over sequences */ for(final SAMSequenceRecord ssr: dict.getSequences()) { final LinkedList<VariantContext> variantBuffer = new LinkedList<>(); GenomicSequence genomicSequence=new GenomicSequence(this.indexedFastaSequenceFile, ssr.getSequenceName()); int x=1; while(x+this.fragmentSize <= ssr.getSequenceLength()) { // pop on left while(!variantBuffer.isEmpty() && (variantBuffer.getFirst().getStart()+this.fragmentSize*2) < x) { variantBuffer.removeFirst(); } //refill buffer while(vcfIterator.hasNext() ) { //buffer is already by enough if(!variantBuffer.isEmpty() && variantBuffer.getLast().getStart()> x+2*fragmentSize) { break; } final VariantContext ctx = vcfIterator.peek(); if(ctx==null) break; if(ctx.isIndel() || !ctx.isVariant()) { LOG.warning("Cannot handle "+ctx.getReference()+"/"+ctx.getAlternateAlleles()); vcfIterator.next();//consumme continue; } final SAMSequenceRecord variantContig = dict.getSequence( ctx.getContig()); if(variantContig == null) throw new IOException("Unknown contig "+ctx.getContig()); if(variantContig.getSequenceIndex()< ssr.getSequenceIndex()) { throw new IOException("Variants are not ordered on sequence dictionary ("+ctx+")"); } else if(variantContig.getSequenceIndex() > ssr.getSequenceIndex()) { break; } else { variantBuffer.add(vcfIterator.next()); } } if(variantBuffer.isEmpty()) { LOG.info("no variant ?"); //no variant on this chromosome break; } if(x < variantBuffer.getFirst().getStart()-2*fragmentSize) { x= variantBuffer.getFirst().getStart()-2*fragmentSize; } for(int depth =0;depth<1;++depth) { for(String sample:header.getSampleNamesInOrder()) { //loop over genomic strand for(int g_strand=0;g_strand<2;++g_strand) { SAMRecord records[]={ new SAMRecord(samHeader), new SAMRecord(samHeader) }; String readName=String.format("%010d",++id_generator); for(int R=0;R<2;++R) { records[R].setReferenceName(ssr.getSequenceName()); records[R].setReadPairedFlag(true); records[R].setReadName(readName); records[R].setMappingQuality(60); records[R].setProperPairFlag(true); records[R].setMateReferenceName(ssr.getSequenceName()); records[R].setMateUnmappedFlag(false); records[R].setReadUnmappedFlag(false); records[R].setAttribute("RG", sample); records[R].setReadNegativeStrandFlag(R==1); StringBuilder sequence = new StringBuilder( this.readSize); int readLen=0; int refPos1=(R==0?x:x+this.fragmentSize-this.readSize); records[R].setAlignmentStart(refPos1); List<CigarElement> cigarElements = new ArrayList<>( this.readSize); int NM=0; while(readLen< this.readSize) { String base=null; VariantContext overlap=null; for(VariantContext vc:variantBuffer) { int d= vc.getStart() - ( refPos1 + readLen) ; if( d==0) { overlap=vc; break; } if( d> 0) break; } if(overlap!=null) { Genotype genotype = overlap.getGenotype(sample); if(genotype.isCalled() && !genotype.isMixed()) { List<Allele> alleles = genotype.getAlleles(); if(alleles.size()!=2) throw new RuntimeException("Not a diploid organism."); Allele allele=null; if(genotype.isPhased()) { allele =alleles.get(g_strand); } else { allele = alleles.get(Math.random()< 0.5?0:1); } if(allele.isSymbolic()) throw new RuntimeException("Cannot handle symbolic alleles"); if(allele.isReference()) { cigarElements.add(new CigarElement(allele.getBaseString().length(), CigarOperator.EQ)); } else if(allele.getBaseString().length() < overlap.getReference().getBaseString().length()) { cigarElements.add(new CigarElement(allele.getBaseString().length(), CigarOperator.D)); NM++; } else if(allele.getBaseString().length() > overlap.getReference().getBaseString().length()) { cigarElements.add(new CigarElement(allele.getBaseString().length(), CigarOperator.I)); NM++; } else //same size { cigarElements.add(new CigarElement(allele.getBaseString().length(), CigarOperator.X)); NM++; } base=allele.getBaseString().toLowerCase(); } } if(base==null) { base=String.valueOf(genomicSequence.charAt(refPos1-1+readLen)); cigarElements.add(new CigarElement(1, CigarOperator.EQ)); } sequence.append(base); readLen+=base.length(); }//end loop over read-leng while(sequence.length()> this.readSize) sequence.deleteCharAt(sequence.length()-1); records[R].setReadString(sequence.toString()); StringBuilder qual=new StringBuilder(sequence.length()); for(int i=0;i< sequence.length();++i) qual.append("I"); records[R].setBaseQualityString(qual.toString()); //cigar int c=0; while(c+1< cigarElements.size()) { CigarElement c0=cigarElements.get(c); CigarElement c1=cigarElements.get(c+1); if(c0.getOperator().equals(c1.getOperator())) { cigarElements.set(c, new CigarElement(c0.getLength()+c1.getLength(), c0.getOperator())); cigarElements.remove(c+1); } else { ++c; } } records[R].setCigar(new Cigar(cigarElements)); records[R].setAttribute("NM",NM); }//end loop over R1/R2 if(Math.random()<0.5) { records[0].setFirstOfPairFlag(true);records[0].setSecondOfPairFlag(false); records[1].setFirstOfPairFlag(false);records[1].setSecondOfPairFlag(true); } else { records[0].setFirstOfPairFlag(false);records[0].setSecondOfPairFlag(true); records[1].setFirstOfPairFlag(true);records[1].setSecondOfPairFlag(false); } records[0].setMateAlignmentStart( records[1].getAlignmentStart() ); records[1].setMateAlignmentStart( records[0].getAlignmentStart() ); records[0].setInferredInsertSize(records[1].getAlignmentStart() - records[0].getAlignmentStart()); records[1].setInferredInsertSize(records[0].getAlignmentStart() - records[1].getAlignmentStart()); samFileWriter.addAlignment(records[0]); samFileWriter.addAlignment(records[1]); } } } ++x; } } samFileWriter.close(); } @Override public int doWork(List<String> args) { if(this.indexedFastaSequenceFile==null) { LOG.error("No REF defined"); return -1; } if(readSize<=0) { LOG.error("bad read size"); return -1; } if(fragmentSize<=readSize*2) { LOG.error("bad fragment size"); return -1; } VcfIterator iter=null; try { this.indexedFastaSequenceFile= new IndexedFastaSequenceFile(this.faidx); iter = super.openVcfIterator(super.oneFileOrNull(args)); run(iter); return 0; } catch(final Exception err) { LOG.error(err); return -1; } finally { CloserUtil.close(iter); } } public static void main(String[] args) { new VcfToBam().instanceMainWithExit(args); } }