/* The MIT License (MIT) Copyright (c) 2014 Pierre Lindenbaum Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. History: * 2014 creation */ package com.github.lindenb.jvarkit.tools.vcfrebase; import java.io.File; import java.io.IOException; import java.util.HashSet; import java.util.List; import java.util.Set; import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.RuntimeIOException; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import com.beust.jcommander.Parameter; import com.beust.jcommander.ParametersDelegate; import com.github.lindenb.jvarkit.lang.JvarkitException; import com.github.lindenb.jvarkit.util.bio.AcidNucleics; import com.github.lindenb.jvarkit.util.bio.Rebase; import com.github.lindenb.jvarkit.util.log.Logger; import com.github.lindenb.jvarkit.util.picard.GenomicSequence; import com.github.lindenb.jvarkit.util.vcf.DelegateVariantContextWriter; import com.github.lindenb.jvarkit.util.vcf.VCFUtils; import com.github.lindenb.jvarkit.util.vcf.VcfIterator; /** BEGIN_DOC ## Example ``` $ curl -s "ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz" |\ gunzip -c |\ java -jar dist/vcfrebase.jar -R human_g1k_v37.fasta -w 8 |\ grep -E '(#|ENZ=)' ##fileformat=VCFv4.1 ##INFO=<ID=ENZ,Number=.,Type=String,Description="Enzyme overlapping: Format: (Name,Site,Sequence,pos-1,strand)"> (...) #CHROM POS ID REF ALT QUAL FILTER INFO 1 256138 rs372711491 T G . . ENZ=(SwaI|ATTT^AAAT|ATTTAAAT|256139|+),(SwaI|ATTT^AAAT|ATTTAAAt|256131|+);OTHERKG;RS=372711491;RSPOS=256138;SAO=0;SSR=0;VC=SNV;VP=0x050000000001000002000100;WGT=1;dbSNPBuildID=138 1 744076 rs142181107 A C . . CAF=[0.9775,0.0225];COMMON=1;ENZ=(SwaI|ATTT^AAAT|ATTTAAaT|744070|+);KGPROD;KGPhase1;RS=142181107;RSPOS=744076;SAO=0;SSR=0;VC=SNV;VP=0x050000000001100014000100;WGT=1;dbSNPBuildID=134 1 762592 rs71507462 C G . . CAF=[0.2755,0.7245];COMMON=1;ENZ=(MauBI|CG^CGCGCG|cGCGCGCG|762592|+);GNO;KGPROD;KGPhase1;OTHERKG;R5;RS=71507462;RSPOS=762592;SAO=0;SLO;SSR=0;VC=SNV;VP=0x050100020001100116000100;WGT=1;dbSNPBuildID=130 1 780347 rs202219272 TTTAA T . . CAF=[0.9867,0.01331];COMMON=1;ENZ=(PacI|TTAAT^TAA|ttaaTTAA|780348|+);INT;KGPROD;KGPhase1;KGPilot123;OTHERKG;RS=202219272;RSPOS=780348;SAO=0;SSR=0;VC=DIV;VP=0x05000008000110001e000200;WGT=1;dbSNPBuildID=137 1 780361 rs111333032 A T . . ENZ=(SwaI|ATTT^AAAT|ATTTAAaT|780355|+);GNO;INT;OTHERKG;RS=111333032;RSPOS=780361;SAO=0;SLO;SSR=0;VC=SNV;VP=0x050100080001000102000100;WGT=1;dbSNPBuildID=132 1 786891 rs183914415 C T . . CAF=[0.9752,0.02479];COMMON=1;ENZ=(AscI|GG^CGCGCC|GGCGCGCC|786892|+);INT;KGPROD;KGPhase1;RS=183914415;RSPOS=786891;SAO=0;SSR=0;VC=SNV;VP=0x050000080001100014000100;WGT=1;dbSNPBuildID=135 1 820332 rs201213736 A G,T . . ENZ=(SwaI|ATTT^AAAT|ATTTAAaT|820326|+);OTHERKG;RS=201213736;RSPOS=820332;SAO=0;SSR=0;VC=SNV;VP=0x050000000001000002000100;WGT=1;dbSNPBuildID=137 1 820333 rs201439577 T C . . ENZ=(SwaI|ATTT^AAAT|ATTTAAAt|820326|+);OTHERKG;RS=201439577;RSPOS=820333;SAO=0;SSR=0;VC=SNV;VP=0x050000000001000002000100;WGT=1;dbSNPBuildID=137 1 822560 rs200342299 C T . . ENZ=(SfiI|GGCCNNNN^NGGCC|GGCCAGcTTGGCC|822554|+);OTHERKG;RS=200342299;RSPOS=822560;SAO=0;SSR=0;VC=SNV;VP=0x050000000001000002000100;WGT=1;dbSNPBuildID=137 1 822565 rs111427246 C T . . ENZ=(SfiI|GGCCNNNN^NGGCC|GGCCAGCTTGGcC|822554|+);GNO;OTHERKG;RS=111427246;RSPOS=822565;SAO=0;SLO;SSR=0;VC=SNV;VP=0x050100000001000102000100;WGT=1;dbSNPBuildID=132 1 837753 rs187865648 G A . . CAF=[0.9995,0.0004591];COMMON=0;ENZ=(SfiI|GGCCNNNN^NGGCC|gGCCATTCTGGCC|837753|+);KGPROD;KGPhase1;RS=187865648;RSPOS=837753;SAO=0;SSR=0;VC=SNV;VP=0x050000000001000014000100;WGT=1;dbSNPBuildID=135 1 839873 rs192553893 C T . . CAF=[0.6768,0.3232];COMMON=1;ENZ=(NotI|GC^GGCCGC|GcGGCCGC|839872|+);KGPROD;KGPhase1;OTHERKG;RS=192553893;RSPOS=839873;SAO=0;SSR=0;VC=SNV;VP=0x050000000001100016000100;WGT=1;dbSNPBuildID=135 1 839911 rs76652930 C T . . ASP;ENZ=(NotI|GC^GGCCGC|GcGGCCGC|839910|+);GNO;OTHERKG;RS=76652930;RSPOS=839911;SAO=0;SSR=0;VC=SNV;VP=0x050000000005000102000100;WGT=1;dbSNPBuildID=131 1 839912 rs369394889 GGCC G . . ENZ=(NotI|GC^GGCCGC|GCggccGC|839910|+);OTHERKG;RS=369394889;RSPOS=839913;SAO=0;SSR=0;VC=DIV;VP=0x050000000001000002000200;WGT=1;dbSNPBuildID=138 1 839933 rs146045242 C T . . CAF=[0.8852,0.1148];COMMON=1;ENZ=(NotI|GC^GGCCGC|GcGGCCGC|839932|+);KGPROD;KGPhase1;OTHERKG;RS=146045242;RSPOS=839933;SAO=0;SSR=0;VC=SNV;VP=0x050000000001100016000100;WGT=1;dbSNPBuildID=134 1 840009 rs140080750 C T . . CAF=[0.9692,0.03076];COMMON=1;ENZ=(NotI|GC^GGCCGC|GcGGCCGC|840008|+);KGPROD;KGPhase1;OTHERKG;RS=140080750;RSPOS=840009;SAO=0;SSR=0;VC=SNV;VP=0x050000000001100016000100;WGT=1;dbSNPBuildID=134 ``` END_DOC */ public class VcfRebase { private static final Logger LOG = Logger.build().prefix("vcfRebase").make(); private IndexedFastaSequenceFile indexedFastaSequenceFile=null; private Rebase rebase=Rebase.createDefaultRebase(); @Parameter(names={"-A","--attribute"},description="VCF INFO attribute") private String ATT="ENZ"; public VcfRebase() { } public VcfRebase indexedFastaSequenceFile(final IndexedFastaSequenceFile ref) { this.indexedFastaSequenceFile=ref; return this; } public VcfRebase tag(final String tag) { this.ATT=tag; return this; } public VcfRebase rebase(final Rebase reb) { this.rebase=reb; return this; } public VariantContextWriter open(final VariantContextWriter delegate) { if(this.indexedFastaSequenceFile==null) throw new JvarkitException.UserError("Indexed fasta file was not provided."); if(this.rebase==null) throw new JvarkitException.UserError("Rebase missing."); return new Worker(delegate, this.indexedFastaSequenceFile, this.rebase, this.ATT ); } private static class Worker extends DelegateVariantContextWriter { private final String ATT; private GenomicSequence genomicSequence=null; private final IndexedFastaSequenceFile indexedFastaSequenceFile; private final Rebase rebase; Worker( final VariantContextWriter delegate, final IndexedFastaSequenceFile indexedFastaSequenceFile, final Rebase rebase, final String att ) { super(delegate); this.indexedFastaSequenceFile=indexedFastaSequenceFile; this.rebase=rebase; this.ATT=att; } @Override public void writeHeader(final VCFHeader header) { final VCFHeader header2=new VCFHeader(header); header2.addMetaDataLine(new VCFInfoHeaderLine(ATT, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Enzyme overlapping: Format: (Name,Site,Sequence,pos-1,strand)")); super.writeHeader(header2); } @Override public void add(VariantContext var) { if(genomicSequence==null || !genomicSequence.getChrom().equals(var.getContig())) { LOG.info("Current contig "+var.getContig()); this.genomicSequence=new GenomicSequence(this.indexedFastaSequenceFile,var.getContig()); } final Set<String> hits=new HashSet<String>(); for(final Rebase.Enzyme enz:this.rebase) { int start0=Math.max(0, var.getStart() - enz.size()); for(int y=start0;y<=var.getStart();++y) { //run each strand for(int strand=0;strand<2;++strand) { int x=0; //loop over bases of the enzyme for(x=0;x< enz.size() && y+x < this.genomicSequence.length() ;++x ) { char c=(strand==0? enz.at(x): AcidNucleics.complement(enz.at((enz.size()-1)-x)) ); if(!Rebase.compatible(genomicSequence.charAt(y+x),c)) break; } // match found if(x==enz.size()) { StringBuilder b=new StringBuilder("("); b.append(enz.getName()); b.append("|"); b.append(enz.getDecl()); b.append("|"); for(x=0;x < enz.size();++x) { char c=genomicSequence.charAt(y+x); if(y+x>=var.getStart()-1 && y+x<=var.getEnd()-1) { c=Character.toLowerCase(c); } b.append(c); } b.append("|"); b.append(y+1); b.append("|"); b.append(strand==0?"+":"-"); b.append(")"); hits.add(b.toString()); break; } if(enz.isPalindromic()) break; } } } if(hits.isEmpty()) { super.add(var); } else { VariantContextBuilder vcb=new VariantContextBuilder(var); vcb.attribute(ATT, hits.toArray(new String[hits.size()])); super.add(vcb.make()); } } } public static class Launcher extends com.github.lindenb.jvarkit.util.jcommander.Launcher { @ParametersDelegate private VcfRebase instance=new VcfRebase(); @Parameter(names={"-R","-reference"},description="Reference fasta file") File referenceFile; @Parameter(names={"-E","-enzyme"},description="restrict to that enzyme name") private Set<String> selEnzymesStr = new HashSet<>(); @Parameter(names={"-w","-weight"},description="min enzyme weight") private float weight= 5f; @Parameter(names={"-o","--out"},required=false,description="Output vcf , ot stdin") private VariantContextWriter out=new VcfWriterOnDemand(); @Override public int doWork(List<String> args) { Rebase rebase=Rebase.createDefaultRebase(); if(!this.selEnzymesStr.isEmpty()) { Rebase rebase2=new Rebase(); for(String e:selEnzymesStr) { if(e.isEmpty()) continue; Rebase.Enzyme enz=rebase.getEnzymeByName(e); if(enz==null) { StringBuilder msg= new StringBuilder(); msg.append("Cannot find enzyme \""+e +"\" in RE list.\n"); msg.append("Current list is:\n"); for(Rebase.Enzyme E: rebase) { msg.append("\t"+E+"\n"); } throw new JvarkitException.UserError(msg.toString()); } rebase2.getEnzymes().add(enz); } rebase=rebase2; } int i=0; while(i< rebase.size()) { if(rebase.get(i).getWeight()< weight) { rebase.getEnzymes().remove(i); } else { ++i; } } if(rebase.size()==0) { LOG.warn("REBASE IS EMPTY"); } if(referenceFile==null) { throw new JvarkitException.ReferenceMissing("reference.undefined"); } IOUtil.assertFileIsReadable(referenceFile); IndexedFastaSequenceFile indexedFastaSequenceFile=null; VcfIterator in=null; VariantContextWriter out=null; try { this.instance. rebase(rebase). indexedFastaSequenceFile(indexedFastaSequenceFile); indexedFastaSequenceFile=new IndexedFastaSequenceFile(this.referenceFile); in = VCFUtils.createVcfIterator(super.oneFileOrNull(args)); out=instance.open(this.out); VCFUtils.copyHeaderAndVariantsTo(in,out); in.close(); out.close(); } catch(final IOException err) { throw new RuntimeIOException(err); } finally { CloserUtil.close(indexedFastaSequenceFile); CloserUtil.close(out); CloserUtil.close(in); } return super.doWork(args); } } public static void main(String[] args) { new VcfRebase.Launcher().instanceMainWithExit(args); } }