package com.github.lindenb.jvarkit.tools.gatk.variants; import java.io.File; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.gatk.engine.filters.MalformedReadFilter; import org.broadinstitute.gatk.engine.filters.ReadFilter; import org.broadinstitute.gatk.engine.walkers.Allows; import org.broadinstitute.gatk.engine.walkers.By; import org.broadinstitute.gatk.engine.walkers.DataSource; import org.broadinstitute.gatk.engine.walkers.Downsample; import org.broadinstitute.gatk.engine.walkers.Reference; import org.broadinstitute.gatk.engine.walkers.Requires; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.TreeReducible; import org.broadinstitute.gatk.engine.walkers.Window; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.downsampling.DownsampleType; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.IOUtil; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFCompoundHeaderLine; import htsjdk.variant.vcf.VCFFilterHeaderLine; import htsjdk.variant.vcf.VCFFormatHeaderLine; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineType; @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) @Requires(value={}) @Allows(value={DataSource.READS, DataSource.REFERENCE}) @Reference(window=@Window(start=-50,stop=50)) @Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=1000000) @By(DataSource.REFERENCE) public class SoftClipAnnotator extends RodWalker<Integer, Integer> implements TreeReducible<Integer> { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @Output(doc="File to which variants should be written") protected VariantContextWriter vcfWriter = null; @Input(doc="sam Readers. We don't use the standard GATK '-I' option because: https://github.com/broadinstitute/gatk-protected/issues/891 ",fullName="bams",shortName="bams",required=false) protected List<File> samFilenames=new ArrayList<>(); @Argument(doc="Extend read location. see https://github.com/broadinstitute/gatk-protected/issues/891",shortName="xclip",fullName="extendclip",required=false) protected int extend4clip=200; private List<SamReader> samReaders=new ArrayList<>(); private Map<String,List<SamReader>> sample2bam=new HashMap<>(); private final VCFFormatHeaderLine numClipInGenotypeFormatHeaderLine = new VCFFormatHeaderLine("NSOFTCLIP",1,VCFHeaderLineType.Integer,"Number of reads containing a soft clipped base at this positions"); private final VCFFilterHeaderLine filterHaveClipInVariant = new VCFFilterHeaderLine("SOFT_CLIP_IN_VARIANT","All called and non-homref genotypes contain soft-clipped reads"); public void initialize() { final Set<VCFHeaderLine> hInfo = new HashSet<>(); final List<String> rodName = Arrays.asList(variantCollection.variants.getName()); final Set<String> samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName); for(final String sample:samples) { this.sample2bam.put(sample, new ArrayList<>()); } for ( final VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(),rodName) ) { if ( isUniqueHeaderLine(line, hInfo) ) hInfo.add(line); } VCFHeader vcfHeader = new VCFHeader(hInfo, samples); VCFHeader header2=new VCFHeader(vcfHeader); header2.addMetaDataLine(this.numClipInGenotypeFormatHeaderLine); header2.addMetaDataLine(this.filterHaveClipInVariant); vcfWriter.writeHeader(header2); final SamReaderFactory srf = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.LENIENT); for(final File samFilename:IOUtil.unrollFiles(this.samFilenames,".bam")) { logger.info("opening "+samFilename); final SamReader r=srf.open(samFilename); final Set<String> sampleset= new HashSet<>(); for(final SAMReadGroupRecord g:r.getFileHeader().getReadGroups()) { if(g.getSample()==null || !this.sample2bam.containsKey(g.getSample())) continue; sampleset.add(g.getSample()); } if(sampleset.isEmpty()) { logger.info("no VCF sample in "+samFilename); CloserUtil.close(r); continue; } this.samReaders.add(r); for(final String sample:sampleset) { if(!this.sample2bam.containsKey(sample)) continue; this.sample2bam.get(sample).add(r); } } } @Override public boolean includeReadsWithDeletionAtLoci() { return true; } public Integer map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { if(tracker==null) return 0; final Collection<VariantContext> VCs = tracker.getValues(variantCollection.variants, context.getLocation()); if ( VCs.isEmpty() ) { return 0; } final Collection<ReadFilter> readFilters = super.getToolkit().getFilters(); for ( final VariantContext ctx : VCs ) { int observed_genotypes=0; int num_some_clipped_genotypes=0; final List<Genotype> genotypes=new ArrayList<>(ctx.getNSamples()); for(int i=0; i< ctx.getNSamples();++i) { final Genotype g=ctx.getGenotype(i); if(g.isNoCall() || g.isHomRef()) { genotypes.add(g); continue; } final List<SamReader> bamReaderForSample = this.sample2bam.get(g.getSampleName()); if(bamReaderForSample.isEmpty()) { genotypes.add(g); continue; } observed_genotypes++; int numberOfClipsOverlapping=0; for(final SamReader samReader: bamReaderForSample) { final SAMRecordIterator iter= samReader.query( ctx.getContig(), Math.max(0, ctx.getStart()-extend4clip), ctx.getEnd()+extend4clip , false); while(iter.hasNext()) { final SAMRecord samRecord = iter.next(); if(samRecord.getReadUnmappedFlag() || samRecord.getCigar()==null || !samRecord.getContig().equals(ctx.getContig()) || samRecord.getUnclippedEnd() < ctx.getStart() || samRecord.getUnclippedStart() > ctx.getEnd() || samRecord.getReadGroup()==null || !g.getSampleName().equals(samRecord.getReadGroup().getSample()) ) continue; boolean ok=true; for(final ReadFilter readFilter:readFilters) { //this one cannot't be correctly initialized... if(readFilter instanceof MalformedReadFilter) continue; if(readFilter.filterOut(samRecord)) { ok=false; break; } } if(!ok) continue; int refPos=samRecord.getUnclippedStart(); for(final CigarElement ce:samRecord.getCigar()) { if( ce.getOperator().consumesReferenceBases() || ce.getOperator().isClipping()) { final int refEnd = refPos+ce.getLength(); if(!(ctx.getEnd() < refPos || refEnd < ctx.getStart())) { //System.err.println("IN "+ce+" "+ctx.getStart()+"-"+ctx.getEnd()+" : "+refPos+"-"+refPos+ce.getLength()); if(ce.getOperator().equals(CigarOperator.S)) { ++numberOfClipsOverlapping; } } refPos+=ce.getLength(); } if(refPos> ctx.getEnd()) break; } } iter.close(); }//end of loop over SamRecord if(numberOfClipsOverlapping==0) { genotypes.add(g); } else { GenotypeBuilder gb=new GenotypeBuilder(g); gb.attribute(numClipInGenotypeFormatHeaderLine.getID(), numberOfClipsOverlapping); genotypes.add(gb.make()); num_some_clipped_genotypes++; } }/* end loop oversam Reader */ if(num_some_clipped_genotypes==0) { vcfWriter.add(ctx); } else { final VariantContextBuilder vcb=new VariantContextBuilder(ctx); vcb.genotypes(genotypes); if(observed_genotypes==num_some_clipped_genotypes) { vcb.filter(this.filterHaveClipInVariant.getID()); } vcfWriter.add(vcb.make()); } } return VCs.size(); } @Override public Integer reduceInit() { return 0; } @Override public Integer reduce(Integer value, Integer sum) { return value + sum; } @Override public Integer treeReduce(Integer lhs, Integer rhs) { return lhs + rhs; } /** * Tell the user the number of loci processed and close out the new variants file. * * @param result the number of loci seen. */ public void onTraversalDone(Integer result) { for(final SamReader samReader:this.samReaders) { CloserUtil.close(samReader); } logger.info("Processed " + result + " loci.\n"); } private static boolean isUniqueHeaderLine(VCFHeaderLine line, Set<VCFHeaderLine> currentSet) { if ( !(line instanceof VCFCompoundHeaderLine) ) return true; for ( VCFHeaderLine hLine : currentSet ) { if ( hLine instanceof VCFCompoundHeaderLine && ((VCFCompoundHeaderLine)line).sameLineTypeAndName((VCFCompoundHeaderLine)hLine) ) return false; } return true; } }