package com.github.lindenb.jvarkit.tools.biostar; import java.io.BufferedReader; import java.io.File; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; import htsjdk.samtools.util.CloserUtil; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFFormatHeaderLine; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLineType; import com.beust.jcommander.Parameter; import com.github.lindenb.jvarkit.io.IOUtils; import com.github.lindenb.jvarkit.util.jcommander.Launcher; import com.github.lindenb.jvarkit.util.jcommander.Program; import com.github.lindenb.jvarkit.util.log.Logger; import com.github.lindenb.jvarkit.util.vcf.ContigPosRef; import com.github.lindenb.jvarkit.util.vcf.VCFUtils; import com.github.lindenb.jvarkit.util.vcf.VcfIterator; /** BEGIN_DOC ## Example ```bash $ cat reset.txt 20 14370 NA00001 20 1234567 NA00003 20 1110696 NA00002 $ curl "https://raw.github.com/jamescasbon/PyVCF/master/vcf/test/example-4.1.vcf" |\ java -jar dist/biostar86363.jar -G reset.txt ##fileformat=VCFv4.1 ##FILTER=<ID=q10,Description="Quality below 10"> ##FILTER=<ID=s50,Description="Less than 50% of samples have data"> ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> ##FORMAT=<ID=GR,Number=1,Type=Integer,Description="(1) = Genotype was reset by Biostar86363:Set genotype of specific sample/genotype comb to unknown in multisample vcf fi le. See http://www.biostars.org/p/86363/"> ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> ##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality"> ##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele"> ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency"> ##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129"> ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth"> ##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership"> ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data"> ##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x> ##fileDate=20090805 ##phasing=partial ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta ##source=myImputationProgramV3.1 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 20 14370 rs6054257 G A 29 PASS AF=0.5;DB;DP=14;H2;NS=3 GT:DP:GQ:GR:HQ .|.:1:48:1:51,51 1|0:8:48:0:51,51 1/1:5:43:0 20 17330 . T A 3 q10 AF=0.017;DP=11;NS=3 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 20 1110696 rs6040355 A G,T 67 PASS AA=T;AF=0.333,0.667;DB;DP=10;NS=2 GT:DP:GQ:GR:HQ 1|2:6:21:0:23,27 .|.:0:2:1:18,2 2/2:4:35:0 20 1230237 . T . 47 PASS AA=T;DP=13;NS=3 GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 20 1234567 microsat1 GTC G,GTCT 50 PASS AA=G;DP=9;NS=3 GT:DP:GQ:GR 0/1:4:35:0 0/2:2:17:0 ./.:3:40:1 ``` END_DOC */ @Program(name="biostar86363", biostars=86363, keywords={"sample","genotype","vcf"}, description="Set genotype of specific sample/genotype comb to unknown in multisample vcf file. See http://www.biostars.org/p/86363/") public class Biostar86363 extends Launcher { private static final Logger LOG = Logger.build(Biostar86363.class).make(); @Parameter(names={"-o","--output"},description="Output file. Optional . Default: stdout") private File outputFile = null; private Map<ContigPosRef,Set<String>> pos2sample=new HashMap<ContigPosRef, Set<String>>(); private Biostar86363() { } @Parameter(names="-G",description="genotypes to reset. Format :CHROM(tab)POS(tab)ref(tab)SAMPLE. REQUIRED.",required=true) private File genotypeFile=null; @Override protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) { final List<Allele> empty_g=new ArrayList<Allele>(2); empty_g.add(Allele.NO_CALL); empty_g.add(Allele.NO_CALL); VCFHeader h=in.getHeader(); final List<String> vcf_samples=h.getSampleNamesInOrder(); h.addMetaDataLine(new VCFFormatHeaderLine("GR", 1, VCFHeaderLineType.Integer, "(1) = Genotype was reset by "+getProgramName())); out.writeHeader(h); while(in.hasNext()) { VariantContext ctx=in.next(); ContigPosRef cap=new ContigPosRef(ctx); Set<String> samplesToReset=this.pos2sample.get(cap); if(samplesToReset!=null) { VariantContextBuilder vcb=new VariantContextBuilder(ctx); List<Genotype> genotypes=new ArrayList<Genotype>(); for(String sample:vcf_samples) { Genotype g=ctx.getGenotype(sample); if(g==null) continue; GenotypeBuilder gb=new GenotypeBuilder(g); if(samplesToReset.contains(sample)) { gb.alleles(empty_g); gb.attribute("GR",1); } else { gb.attribute("GR",0); } g=gb.make(); genotypes.add(g); } vcb.genotypes(genotypes); ctx=VCFUtils.recalculateAttributes(vcb.make()); } out.add(ctx); } return 0; } @Override public int doWork(List<String> args) { if(genotypeFile==null) { LOG.error("undefined genotype file"); return -1; } BufferedReader in=null; try { Pattern tab=Pattern.compile("[\t]"); in=IOUtils.openFileForBufferedReading(this.genotypeFile); String line; while((line=in.readLine())!=null) { if(line.isEmpty() || line.startsWith("#")) continue; String tokens[]=tab.split(line); if(tokens.length<4) { LOG.error("Bad line in "+line); in.close(); in=null; return -1; } ContigPosRef cap=new ContigPosRef( tokens[0], Integer.parseInt(tokens[1]), Allele.create(tokens[2],true) ); Set<String> samples=this.pos2sample.get(cap); if(samples==null) { samples=new HashSet<String>(); this.pos2sample.put(cap,samples); } samples.add(tokens[3]); } in.close(); return doVcfToVcf(args, outputFile); } catch(Exception err) { LOG.error(err); return -1; } finally { CloserUtil.close(in); } } /** * @param args */ public static void main(String[] args) { new Biostar86363().instanceMainWithExit(args); } }