/* The MIT License (MIT) Copyright (c) 2014 Pierre Lindenbaum Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. History: * 2015 moving to knime and adding predictions snpeff and VEP * 2014 creation */ package com.github.lindenb.jvarkit.tools.vcffilterjs; import java.io.File; import java.io.FileReader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.List; import javax.script.Bindings; import javax.script.CompiledScript; import com.github.lindenb.jvarkit.lang.JvarkitException; import com.github.lindenb.jvarkit.util.Pedigree; import com.github.lindenb.jvarkit.util.jcommander.Launcher; import com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress; import com.github.lindenb.jvarkit.util.vcf.VcfIterator; import com.github.lindenb.jvarkit.util.vcf.VcfTools; import com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParser; import com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParserFactory; import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser; import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory; import com.github.lindenb.semontology.Term; import com.google.gson.JsonElement; import com.google.gson.JsonParser; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFFilterHeaderLine; import htsjdk.variant.vcf.VCFHeader; /** * Author: Pierre Lindenbaum PhD. @yokofakun * Motivation http://www.biostars.org/p/66319/ */ /** BEGIN_DOC Filters a VCF with javascript ( java Nashorn engine http://www.oracle.com/technetwork/articles/java/jf14-nashorn-2126515.html ) This tool is not safe for a public Galaxy server, because the javascript code can access the filesystem. the script binds the following variables: * variant : the current variation; a org.broadinstitute.variant.variantcontext.VariantContext ( [https://samtools.github.io/htsjdk/javadoc/htsjdk/htsjdk/variant/variantcontext/VariantContext.html](https://samtools.github.io/htsjdk/javadoc/htsjdk/htsjdk/variant/variantcontext/VariantContext.html) ) * header : the VCF header org.broadinstitute.variant.vcf.VCFHeader ( [https://samtools.github.io/htsjdk/javadoc/htsjdk/htsjdk/variant/vcf/VCFHeader.html](https://samtools.github.io/htsjdk/javadoc/htsjdk/htsjdk/variant/vcf/VCFHeader.html) ). if option snpeff is defined: * snpEff (new 2015-02-20 ) a java.util.List of SnpEffPredictionParser$SnpEffPrediction ( https://github.com/lindenb/jvarkit/blob/master/src/main/java/com/github/lindenb/jvarkit/util/vcf/predictions/SnpEffPredictionParser.java ) if option vep is defined: * vep (new 2015-02-20 ) a java.util.List of VepPredictionParser$VepPrediction ( https://github.com/lindenb/jvarkit/blob/master/src/main/java/com/github/lindenb/jvarkit/util/vcf/predictions/VepPredictionParser.java ) . if option casecontrol is defined: * individuals (new 2015-02-20 ) a List<Pedigree.Person>. ### Examples #### Example 1 the file filter.js ``` // prints a VARIATION if two samples at least have a DP<200 function myfilterFunction() { var samples=header.genotypeSamples; var countOkDp=0; for(var i=0; i< samples.size();++i) { var sampleName=samples.get(i); if(! variant.hasGenotype(sampleName)) continue; var genotype = variant.genotypes.get(sampleName); if( ! genotype.hasDP()) continue; var dp= genotype.getDP(); if(dp < 200 ) countOkDp++; } return (countOkDp>2) } myfilterFunction(); ``` ``` $ curl -sL "https://raw.github.com/jamescasbon/PyVCF/master/vcf/test/gatk.vcf" |\ java -jar dist/vcffilterjs.jar -f filter.js ##fileformat=VCFv4.1 ##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed"> ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)"> ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT BLANK NA12878 NA12891 NA12892 NA19238NA19239 NA19240 chr22 42526449 . T A 151.47 . AC=1;AF=0.071;AN=14;BaseQRankSum=2.662;DP=1226;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=41.2083;MQ=240.47;MQ0=0;MQRankSum=0.578;QD=4.89;ReadPosRankSum=3.611 GT:AD:DP:GQ:PL 0/1:23,8:31:99:190,0,694 0/0:188,0:190:99:0,478,5376 0/0:187,0:187:99:0,493,5322 0/0:247,0:249:99:0,634,6728 0/0:185,0:185:99:0,487,5515 0/0:202,0:202:99:0,520,5857 0/0:181,1:182:99:0,440,5362 chr22 42526634 . T C 32.60 . AC=1;AF=0.071;AN=14;BaseQRankSum=1.147;DP=1225;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=50.0151;MQ=240.65;MQ0=0;MQRankSum=1.151;QD=1.30;ReadPosRankSum=1.276 GT:AD:DP:GQ:PL 0/1:21,4:25:71:71,0,702 0/0:187,2:189:99:0,481,6080 0/0:233,0:233:99:0,667,7351 0/0:230,0:230:99:0,667,7394 0/0:174,1:175:99:0,446,5469 0/0:194,2:196:99:0,498,6239 0/0:174,0:175:99:0,511,5894 chr22 42527793 rs1080989 C T 3454.66 . AC=2;AF=0.167;AN=12;BaseQRankSum=-3.007;DB;DP=1074;DS;Dels=0.01;FS=0.000;HRun=1;HaplotypeScore=75.7865;MQ=209.00;MQ0=0;MQRankSum=3.014;QD=9.36;ReadPosRankSum=0.618 GT:AD:DP:GQ:PL ./. 0/1:72,90:162:99:1699,0,1767 0/1:103,96:202:99:1756,0,2532 0/0:188,0:188:99:0,526,5889 0/0:160,0:160:99:0,457,4983 0/0:197,0:198:99:0,544,6100 0/0:156,0:156:99:0,439,5041 ``` #### Example 2 Script used for http://plindenbaum.blogspot.fr/2013/10/yes-choice-of-transcripts-and-software.html ``` function has_missense(v) { if(!v.getClass().getName().equals("java.lang.String")) { var i; for(i=0;i< v.size();++i) { if(has_missense(v.get(i))) return true; } return false; } if(v.indexOf("non_coding_exon_variant")!=-1) return 0; return v.indexOf("missense")!=-1; } function accept(v) { if(v.isIndel()) return 0; var vep=v.getAttribute("CSQ"); if(vep==null ) return 0; var pred=v.getAttribute("PRED"); if(pred==null ) return 0; if(!has_missense(vep) && has_missense(pred)) return 1; return 0; } accept(variant); ``` #### Example 3 Sample having a unique genotype: ``` function accept(ctx) { var x,y,g1,g2,count_same=0; var sampleList=header.getSampleNamesInOrder(); // loop over one sample for(x=0;x < sampleList.size();++x) { g1=ctx.getGenotype( sampleList.get(x) ); // ignore non-called if(! g1.isCalled() ) continue; count_same=0; // loop over the other samples for(y=0;y< sampleList.size() && count_same==0 ;++y) { if(x==y) continue;// same sample ? g2=ctx.getGenotype( sampleList.get(y) ); // ignore non-called if(! g2.isCalled() ) continue; // is g1==g2 ? if( g1.sameGenotype( g2 ) ) { count_same++; } } // found no other same genotype if(count_same==0) return true; } return false; } accept(variant); ``` ``` curl "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.chr1.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz" |\ gunzip -c |\ java -jar dist/vcffilterjs.jar -f select.js |\ grep -v "#" | cut -f 1-5 1 13957 rs201747181 TC T 1 51914 rs190452223 T G 1 54753 rs143174675 T G 1 55313 rs182462964 A T 1 55326 rs3107975 T C 1 55330 rs185215913 G A 1 55388 rs182711216 C T 1 55416 rs193242050 G A 1 55427 rs183189405 T C 1 62156 rs181864839 C T 1 63276 rs185977555 G A 1 66457 rs13328655 T A 1 69534 rs190717287 T C 1 72148 rs182862337 C T 1 77470 rs192898053 T C 1 79137 rs143777184 A T 1 81949 rs181567186 T C 1 83088 rs186081601 G C 1 83977 rs180759811 A G 1 84346 rs187855973 T C ``` verify: ``` curl "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.chr1.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz" |\ gunzip -c |\ grep rs201747181 |\ cut -f 10- |\ tr " " "\n" |\ cut -d ':' -f 1 |\ sort |\ uniq -c 1013 0|0 26 0|1 7 1|0 1 1|1 ``` #### Example 4 filter homozygotes for sample NA12878 ``` curl -sL "https://raw.github.com/jamescasbon/PyVCF/master/vcf/test/gatk.vcf" |\ java -jar dist/vcffilterjs.jar -e 'variant.getGenotype("NA12878").isHom()' ``` END_DOC */ import com.beust.jcommander.Parameter; import com.github.lindenb.jvarkit.util.jcommander.Program; import com.github.lindenb.jvarkit.util.log.Logger; @Program( name="vcffilterjs", description="Filtering VCF with javascript expressions", keywords={"vcf","filter","javascript","json","nashorn"}, terms=Term.ID_0000003, biostars={88921,233587,104021,213032,215885,243972,111924,196057,142215,229935,181358,184966,245802,7403,245181, 117974,242281,252580} ) public class VCFFilterJS extends Launcher { private static final Logger LOG = Logger.build(VCFFilterJS.class).make(); private CompiledScript compiledScript = null; @Parameter(names={"-o","--output"},description="Output file. Optional . Default: stdout") private File outputFile = null; @Parameter(names={"-F","--filter"},description="If not empty, variants won't be discarded and this name will be used in the FILTER column") private String filteredTag = ""; @Parameter(names={"-vep","--vep"},description="Decode prediction tag of ensembl VEP") private boolean use_vep = false; @Parameter(names={"-snpeff","--snpeff"},description="Decode prediction tag of SNPEFF") private boolean use_snpeff = false; @Parameter(names={"-casecontrol","--casecontrol"},description="Decode Case-Control injected with VcfInjectPedigree") private boolean use_casecontrol = false; @Parameter(names={"-e","--expression"},description=" (js expression). Optional.") private String scriptExpr=null; @Parameter(names={"-f","--script"},description=" (js file). Optional.") private File scriptFile=null; @Parameter(names={"-json","--json"},description="json files. syntax key=path/to/file.json . Inject the json object parsed with google gson into the javascript context as 'key'") private List<String> jsonFiles=new ArrayList<>(); public VCFFilterJS() { } @Override protected int doVcfToVcf(String inputName, VcfIterator r, VariantContextWriter w) { try { final VCFHeader header = r.getHeader(); final VcfTools vcfTools = new VcfTools(header); final SnpEffPredictionParser snpEffPredictionParser =(this.use_snpeff ? new SnpEffPredictionParserFactory().header(header).get(): null ); final VepPredictionParser vepPredictionParser = (this.use_vep? new VepPredictionParserFactory().header(header).get(): null ); final VCFHeader h2 = new VCFHeader(header); addMetaData(h2); final List<Pedigree.Person> individuals =(this.use_casecontrol? new ArrayList<>(this.getCasesControlsInPedigree(header)): null ); final VCFFilterHeaderLine filterHeaderLine = (filteredTag.trim().isEmpty()?null: new VCFFilterHeaderLine(this.filteredTag.trim(),"Filtered with "+getProgramName()) ); if(filterHeaderLine!=null) h2.addMetaDataLine(filterHeaderLine); final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header); final Bindings bindings = this.compiledScript.getEngine() .createBindings(); bindings.put("header", header); bindings.put("tools", vcfTools); if(this.use_casecontrol) { bindings.put("individuals", individuals); } for(final String jsonkv :this.jsonFiles) { int eq=jsonkv.indexOf("="); if(eq<=0) throw new JvarkitException.UserError("Bad format for json . expected key=/path/to/file.json but got '"+jsonkv+"'"); final String key=jsonkv.substring(0,eq); final FileReader jsonFile = new FileReader(jsonkv.substring(eq+1)); JsonParser jsonParser=new JsonParser(); final JsonElement root=jsonParser.parse(jsonFile); jsonFile.close(); bindings.put(key, root); } w.writeHeader(h2); while (r.hasNext() && !w.checkError()) { final VariantContext variation = progress.watch(r.next()); bindings.put("variant", variation); if(this.use_snpeff) { bindings.put("snpEff", snpEffPredictionParser.getPredictions(variation)); } if(this.use_vep) { bindings.put("vep", vepPredictionParser.getPredictions(variation)); } final Object result = compiledScript.eval(bindings); // result is an array of a collection of variants if(result!=null && (result.getClass().isArray() || (result instanceof Collection))) { final Collection<?> col; if(result.getClass().isArray()) { final Object array[]=(Object[])result; col= Arrays.asList(array); } else { col =( Collection<?>)result; } // write all of variants for(final Object item:col) { if(item==null) throw new JvarkitException.UserError("item in array is null"); if(!(item instanceof VariantContext)) throw new JvarkitException.UserError("item in array is not a VariantContext "+item.getClass()); w.add(VariantContext.class.cast(item)); } } // result is a VariantContext else if(result!=null && (result instanceof VariantContext)) { w.add(VariantContext.class.cast(result)); } else { boolean accept=true; if(result==null) { accept=false; } else if(result instanceof Boolean) { if(Boolean.FALSE.equals(result)) accept = false; } else if(result instanceof Number) { if(((Number)result).intValue()!=1) accept = false; } else { LOG.warn("Script returned something that is not a boolean or a number:"+result.getClass()); accept = false; } if (!accept) { if(filterHeaderLine!=null) { final VariantContextBuilder vcb = new VariantContextBuilder(variation); vcb.filter(filterHeaderLine.getID()); w.add(vcb.make()); } continue; } // set PASS filter if needed if(filterHeaderLine!=null && !variation.isFiltered()) { w.add( new VariantContextBuilder(variation).passFilters().make()); continue; } w.add(variation); } } return RETURN_OK; } catch(final Exception err) { LOG.error(err); return -1; } finally { } } @Override public int doWork(final List<String> args) { try { this.compiledScript = super.compileJavascript(this.scriptExpr,this.scriptFile); return doVcfToVcf(args, this.outputFile); } catch(final Exception err) { LOG.error(err); return -1; } finally { this.compiledScript=null; } } public static void main(String[] args) throws Exception { new VCFFilterJS().instanceMainWithExit(args); } }