/** * */ package com.github.lindenb.jvarkit.tools.jaspar; import java.io.File; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.util.CloserUtil; import htsjdk.tribble.readers.LineIterator; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import com.beust.jcommander.Parameter; import com.github.lindenb.jvarkit.io.IOUtils; import com.github.lindenb.jvarkit.lang.SubSequence; import com.github.lindenb.jvarkit.util.bio.RevCompCharSequence; import com.github.lindenb.jvarkit.util.jcommander.Launcher; import com.github.lindenb.jvarkit.util.jcommander.Program; import com.github.lindenb.jvarkit.util.log.Logger; import com.github.lindenb.jvarkit.util.picard.GenomicSequence; import com.github.lindenb.jvarkit.util.vcf.VcfIterator; @Program(name="vcfjaspar",description="Finds JASPAR profiles in VCF") public class VcfJaspar extends Launcher { private static final Logger LOG = Logger.build(VcfJaspar.class).make(); @Parameter(names={"-o","--output"},description="Output file. Optional . Default: stdout") private File outputFile = null; @Parameter(names="-J",description=" jaspar PFM uri. required. example: http://jaspar.genereg.net/html/DOWNLOAD/JASPAR_CORE/pfm/nonredundant/pfm_vertebrates.txt",required=true) private String jasparUri=null; @Parameter(names="-f",description="(0<ratio<1) fraction of best score") private double fraction_of_max=0.95; @Parameter(names={"-R","-r","--reference"},description="Indexed fasta reference",required=true) private File fasta = null; private IndexedFastaSequenceFile indexedFastaSequenceFile=null; private List<Matrix> jasparDb=new ArrayList<Matrix>(); public VcfJaspar() { } @Override protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) { final String ATT="JASPAR"; GenomicSequence genomicSequence=null; final VCFHeader header=new VCFHeader(in.getHeader()); addMetaData(header); header.addMetaDataLine(new VCFInfoHeaderLine(ATT, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Jaspar pattern overlapping: Format: (Name|length|Score/1000|pos|strand)" )); out.writeHeader(header); while(in.hasNext()) { VariantContext var=in.next(); if(genomicSequence==null || !genomicSequence.getChrom().equals(var.getContig())) { LOG.info("Loading sequence "+var.getContig()); genomicSequence=new GenomicSequence(this.indexedFastaSequenceFile,var.getContig()); } final Set<String> hits=new HashSet<String>(); for(final Matrix matrix:this.jasparDb) { int start0=Math.max(0, var.getStart() - matrix.length()); for(int y=start0;y<var.getStart() && y+matrix.length() <= genomicSequence.length();++y) { CharSequence forward=new SubSequence(genomicSequence,y,y+matrix.length()); CharSequence revcomp=new RevCompCharSequence(forward); //run each strand for(int strand=0;strand<2;++strand) { double score= matrix.score(strand==0?forward:revcomp); if(score<=0) continue; if(score>= matrix.max()*this.fraction_of_max) { StringBuilder b=new StringBuilder("("); b.append(matrix.getName().replaceAll("[ \t;=]+", "/")); b.append("|"); b.append(matrix.length()); b.append("|"); b.append((int)(1000.0*(score/matrix.max()))); b.append("|"); b.append(y+1); b.append("|"); b.append(strand==0?'+':'-'); b.append(")"); hits.add(b.toString()); break; } } } } if(hits.isEmpty()) { out.add(var); continue; } final VariantContextBuilder vcb=new VariantContextBuilder(var); vcb.attribute(ATT, hits.toArray(new String[hits.size()])); out.add(vcb.make()); } return RETURN_OK; } @Override public int doWork(List<String> args) { if(this.jasparUri==null) { LOG.error("Undefined jaspar-uri"); return -1; } if(this.fasta==null) { LOG.error("Undefined fasta sequence"); return -1; } try { LOG.info("Reading JASPAR: "+jasparUri); LineIterator liter = IOUtils.openURIForLineIterator(this.jasparUri); Iterator<Matrix> miter=Matrix.iterator(liter); while(miter.hasNext()) { Matrix matrix = miter.next(); this.jasparDb.add(matrix.convertToPWM()); } CloserUtil.close(liter); LOG.info("JASPAR size: "+this.jasparDb.size()); if(jasparDb.isEmpty()) { LOG.warn("JASPAR IS EMPTY"); } LOG.info("opening "+fasta); this.indexedFastaSequenceFile=new IndexedFastaSequenceFile(fasta); return doVcfToVcf(oneFileOrNull(args),outputFile); } catch(Exception err) { LOG.error(err); return -1; } finally { CloserUtil.close(this.indexedFastaSequenceFile); this.indexedFastaSequenceFile=null; } } public static void main(String[] args) { new VcfJaspar().instanceMainWithExit(args); } }