/** * */ package com.github.lindenb.jvarkit.tools.vcfannobam; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.regex.Pattern; import com.github.lindenb.jvarkit.util.jcommander.Launcher; import com.github.lindenb.jvarkit.util.jcommander.Program; import com.github.lindenb.jvarkit.util.log.Logger; import com.github.lindenb.jvarkit.util.picard.SamFileReaderFactory; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.Interval; import htsjdk.samtools.util.IntervalTreeMap; import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarElement; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SamReader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.util.IntervalList; import htsjdk.samtools.util.SequenceUtil; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import com.beust.jcommander.Parameter; import com.github.lindenb.jvarkit.io.IOUtils; import com.github.lindenb.jvarkit.tools.vcf2rdf.VcfToRdf; import com.github.lindenb.jvarkit.util.vcf.VcfIterator; @Program(name="vcfannobam", deprecatedMsg="useless: use DP/DP4 in the Genotypes", description="Annotate a VCF with the Coverage statistics of a BAM file+ BED file of capture. It uses the Cigar string instead of the start/end to get the voverage") public class VCFAnnoBam extends Launcher { private static final Logger LOG = Logger.build(VcfToRdf.class).make(); @Parameter(names={"-o","--output"},description="Output file. Optional . Default: stdout") private File outputFile = null; @Parameter(names= {"-BED"}, description="BED File capture.",required=true) private File BEDILE=null; @Parameter(names= {"-BAM"}, description="indexed BAM File.") private List<File> BAMFILE=null; @Parameter(names= {"-MIN_MAPING_QUALITY"}, description="min mapping quality") public int MMQ=0; @Parameter(names= {"-MIN_COV"}, description="min coverage to say the position is not covered") public int MIN_COVERAGE=0; private class Rgn { Interval interval; int count_no_coverage=0; double mean=0; double min=0.0; double max=-1; int percent_covered=0; boolean processed=false; @Override public String toString() { StringBuilder b=new StringBuilder(); b.append(interval.getStart()); b.append('|'); b.append(interval.getEnd()); b.append('|'); b.append(String.format("%.2f",mean)); b.append('|'); b.append(min); b.append('|'); b.append(max); b.append('|'); b.append((interval.getEnd()-interval.getStart()+1)); b.append('|'); b.append(count_no_coverage); b.append('|'); b.append(percent_covered); return b.toString(); } } private void process(Rgn rgn,List<SamReader> samReaders) { rgn.processed=true; int chromStart1= rgn.interval.getStart(); int chromEnd1= rgn.interval.getEnd(); int counts[]=new int[chromEnd1-chromStart1+1]; if(counts.length==0) return; Arrays.fill(counts, 0); for(SamReader samReader:samReaders) { /** * start - 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence. * end - 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence. */ SAMRecordIterator r=samReader.queryOverlapping(rgn.interval.getContig(), chromStart1, chromEnd1); while(r.hasNext()) { SAMRecord rec=r.next(); if(rec.getReadUnmappedFlag()) continue; if(rec.getReadFailsVendorQualityCheckFlag()) continue; if(rec.getDuplicateReadFlag()) continue; if(!rec.getReferenceName().equals(rgn.interval.getContig())) continue; if(rec.getMappingQuality()==255 && rec.getMappingQuality()< this.MMQ) { continue; } Cigar cigar=rec.getCigar(); if(cigar==null) continue; int refpos1=rec.getAlignmentStart(); for(CigarElement ce:cigar.getCigarElements()) { switch(ce.getOperator()) { case H:break; case S:break; case I:break; case P:break; case N:// reference skip case D://deletion in reference { refpos1+=ce.getLength(); break; } case M: case EQ: case X: { for(int i=0;i< ce.getLength() && refpos1<= chromEnd1;++i) { if(refpos1>= chromStart1 && refpos1<=chromEnd1) { counts[refpos1-chromStart1]++; } refpos1++; } break; } default: throw new IllegalStateException( "Doesn't know how to handle cigar operator:"+ce.getOperator()+ " cigar:"+cigar ); } } } r.close(); } Arrays.sort(counts); for(int cov:counts) { if(cov<=MIN_COVERAGE) rgn.count_no_coverage++; rgn.mean+=cov; } rgn.mean/=counts.length; rgn.min=counts[0]; rgn.max=counts[counts.length-1]; rgn.percent_covered=(int)(((counts.length-rgn.count_no_coverage)/(double)counts.length)*100.0); rgn.processed=true; } @Override protected int doVcfToVcf(String inputName, VcfIterator r, VariantContextWriter w) { BufferedReader bedIn=null; List<SamReader> samReaders=new ArrayList<SamReader>(); IntervalTreeMap<Rgn> capture=new IntervalTreeMap<Rgn>(); try { SAMFileHeader firstHeader=null; for(File samFile:new HashSet<File>(BAMFILE)) { LOG.info("open bam "+samFile); SamReader samReader = SamFileReaderFactory.mewInstance().open(samFile); SAMFileHeader samHeader=samReader.getFileHeader(); samReaders.add(samReader); if(firstHeader==null) { firstHeader=samHeader; } else if(!SequenceUtil.areSequenceDictionariesEqual( firstHeader.getSequenceDictionary(), samHeader.getSequenceDictionary()) ) { throw new RuntimeException("some same seq dir are incompatibles"); } } IntervalList intervalList=new IntervalList(firstHeader); Pattern tab=Pattern.compile("[\t]"); LOG.info("read bed "+BEDILE); bedIn=IOUtils.openFileForBufferedReading(BEDILE); String line; while((line=bedIn.readLine())!=null) { if(line.isEmpty() || line.startsWith("#")) continue; String tokens[]=tab.split(line,5); if(tokens.length<3) throw new IOException("bad bed line in "+line+" "+this.BEDILE); if(firstHeader.getSequenceDictionary().getSequence(tokens[0])==null) { LOG.error("error in BED +"+BEDILE+" : "+line+" chromosome is not in sequence dict of "+BAMFILE); continue; } int chromStart1= 1+Integer.parseInt(tokens[1]);//add one int chromEnd1= Integer.parseInt(tokens[2]); Interval interval=new Interval(tokens[0], chromStart1, chromEnd1); intervalList.add(interval); } bedIn.close(); bedIn=null; intervalList=intervalList.sorted(); for(Interval interval:intervalList.uniqued()) { Rgn rgn=new Rgn(); rgn.interval=interval; capture.put(rgn.interval, rgn); } intervalList=null; VCFHeader header=r.getHeader(); VCFHeader h2=new VCFHeader(header.getMetaDataInInputOrder(),header.getSampleNamesInOrder()); h2.addMetaDataLine(new VCFInfoHeaderLine( "CAPTURE", 1, VCFHeaderLineType.String, "Capture stats: Format is (start|end|mean|min|max|length|not_covered|percent_covered) BAM files: "+BAMFILE+" CAPTURE:"+BEDILE)); w.writeHeader(h2); while(r.hasNext()) { VariantContext ctx=r.next(); Interval interval=new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd()); Collection<Rgn> rgns=capture.getOverlapping(interval); Iterator<Rgn> it=rgns.iterator(); if(!it.hasNext()) { w.add(ctx); continue; } Rgn rgn=it.next(); if(!rgn.processed) { //LOG.info("processing "+rgn.interval); process(rgn,samReaders); } VariantContextBuilder b=new VariantContextBuilder(ctx); b.attribute("CAPTURE", rgn.toString()); w.add(b.make()); } return 0; } catch(final Exception err) { LOG.error(err); return -1; } finally { for(SamReader samReader:samReaders) CloserUtil.close(samReader); } } @Override public int doWork(List<String> args) { return doVcfToVcf(args, outputFile); } /** * @param args */ public static void main(String[] args) { new VCFAnnoBam().instanceMainWithExit(args); } }