package com.github.lindenb.jvarkit.tools.gatk.variants; import java.text.DecimalFormat; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.OptionalInt; import java.util.Set; import java.util.function.Predicate; import java.util.function.ToIntFunction; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.report.GATKReportTable; import com.github.lindenb.jvarkit.gatk.Category; import com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextUtils; /** * * * * * */ @DocumentedGATKFeature( summary="Reads a VCF file and creates a variant summary table", groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class GroupByVariants extends AbstractGroupBy { @Argument(fullName="minQuality",shortName="mq",required=false,doc="Group by Quality. Set the treshold for Minimum Quality") public double minQuality = -1; @Argument(fullName="chrom",shortName="chrom",required=false,doc="Group by Chromosome/Contig") public boolean bychrom = false; @Argument(fullName="ID",shortName="ID",required=false,doc="Group by ID") public boolean byID = false; @Argument(fullName="variantType",shortName="variantType",required=false,doc="Group by VariantType") public boolean byType = false; @Argument(fullName="filter",shortName="filter",required=false,doc="Group by FILTER") public boolean byFilter = false; @Argument(fullName="impact",shortName="impact",required=false,doc="Group by ANN/IMPACT") public boolean byImpact = false; @Argument(fullName="biotype",shortName="biotype",required=false,doc="Group by ANN/biotype") public boolean bybiotype = false; @Argument(fullName="nalts",shortName="nalts",required=false,doc="Group by number of ALTS") public boolean bynalts = false; @Argument(fullName="affected",shortName="affected",required=false,doc="Group by number of Samples called and not HOMREF") public boolean byAffected = false; @Argument(fullName="called",shortName="called",required=false,doc="Group by number of Samples called") public boolean byCalled = false; @Argument(fullName="maxSamples",shortName="maxSamples",required=false,doc="if the number of samples affected is greater than --maxSamples use the label \"GT_MAX_SAMPLES\"") public int maxSamples = Integer.MAX_VALUE; @Argument(fullName="tsv",shortName="tsv",required=false,doc="Group by Transition/Transversion") public boolean byTsv = false; @Argument(fullName="allelesize",shortName="allelesize",required=false,doc="Group by Max(allele.size)") public boolean byAlleleSize = false; @Argument(fullName="allelefrequency",shortName="af",required=false,doc="Group by Allele Frequency using the AF attribute. The lowest AF is used for multiallelic variants.") public boolean byAlleFrequency = false; @Argument(fullName="depth",shortName="dp",required=false,doc="Group by Depth using the DP attribute") public boolean byDepth = false; @Argument(fullName="attribute",shortName="attribute",required=false,doc="Search for the presence (true/false) of an attribute in the INFO format. For example using GATK.VariantAnnotator and -comp") public Set<String> presence_of_attributes = new HashSet<>(); public boolean bySingleton = false; @Argument(fullName="singleton",shortName="singleton",required=false,doc="Singleton variants") private final DoubleRangeClassifier groupByAfClassifier = new DoubleRangeClassifier( Arrays.asList(new DoubleRangeClassifier.Window(0.0, 0.01),new DoubleRangeClassifier.Window(0.1, 0.1)) , 1.0, new DecimalFormat("#.##")); private final IntRangeClassifier groupByDpClassifier = new IntRangeClassifier( Arrays.asList( new IntRangeClassifier.Window(0,1), new IntRangeClassifier.Window(10,5), new IntRangeClassifier.Window(50,10), new IntRangeClassifier.Window(100,100)) , 1000,NumberFormat.getIntegerInstance()); private final IntRangeClassifier alleleSizeClassifier = new IntRangeClassifier( Arrays.asList( new IntRangeClassifier.Window(0,1), new IntRangeClassifier.Window(10,10), new IntRangeClassifier.Window(100,100), new IntRangeClassifier.Window(1000,1000) ) , 5000,NumberFormat.getIntegerInstance()); @Override public Map<Category,Long> map( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { if ( tracker == null )return Collections.emptyMap(); final Map<Category,Long> count = new HashMap<>(); for(final VariantContext ctx: tracker.getValues(this.variants,context.getLocation())) { final List<Object> labels=new ArrayList<>(); if(bychrom) labels.add(ctx.getContig()); if(byID) labels.add(ctx.hasID()); if(byType) labels.add(ctx.getType().name()); if(byFilter) labels.add(ctx.isFiltered()); if(minQuality>=0) { labels.add(ctx.hasLog10PError() && ctx.getPhredScaledQual()>=this.minQuality ? ".":"LOWQUAL" ); } if(byImpact || bybiotype) { String biotype=null; AnnPredictionParser.Impact impact=null; for(final AnnPredictionParser.AnnPrediction pred: super.annParser.getPredictions(ctx)) { final AnnPredictionParser.Impact currImpact = pred.getPutativeImpact(); if(impact!=null && currImpact.compareTo(impact)<0) continue; impact=currImpact; biotype= pred.getTranscriptBioType(); } if(byImpact) labels.add(impact==null?".":impact); if(bybiotype) labels.add(biotype==null?".":biotype); } if(bynalts)labels.add(ctx.getAlternateAlleles().size()); if(byAffected || byCalled || bySingleton) { int nc=0; int ng=0; int nsingles=0; for(int i=0;i< ctx.getNSamples();++i) { final Genotype g= ctx.getGenotype(i); if(!(g.isNoCall() || g.isHomRef())) { ng++; } if(g.isCalled()) { nc++; if(!g.isHomRef()) { nsingles++; } } } if(byCalled) labels.add(nc< maxSamples?nc:"GE_"+maxSamples); if(byAffected) labels.add(ng< maxSamples?ng:"GE_"+maxSamples); if(bySingleton) labels.add(nsingles==1?"SINGLETON":"."); } if(byTsv) { if(ctx.getType()==VariantContext.Type.SNP && ctx.getAlternateAlleles().size()==1) { boolean b=(VariantContextUtils.isTransition(ctx)); labels.add(b?"Transition":"Transversion"); } else { labels.add("."); } } if(byAlleleSize) { // see http://stackoverflow.com/questions/41678374/ final Predicate<Allele> afilter = new Predicate<Allele>() { @Override public boolean test(final Allele a) { return !(a.isNoCall() || a.isSymbolic() ); } }; final OptionalInt longest = ctx.getAlleles().stream().filter(afilter).mapToInt(new ToIntFunction<Allele>() { public int applyAsInt(final Allele value) {return value.length();}; }).max(); labels.add(longest.isPresent()?alleleSizeClassifier.apply(longest.getAsInt()):"N/A"); } if(byAlleFrequency) { final List<Object> afs= ctx.getAttributeAsList("AF"); if(afs.isEmpty()) { labels.add("NOT_AVAILABLE"); } else { Double minaf=null; for(final Object o:afs) { final Double af; if(o==null) continue; if(o instanceof Double) { af = Double.class.cast(o); } else { try { af=Double.parseDouble(String.valueOf(o)); } catch(NumberFormatException err) { logger.warn("Not a number for AF :"+o); continue; } } if(af<0.0) logger.warn("AF < 0 : "+o); if(af>1.0) logger.warn("AF > 1.0 : "+o); if(minaf==null || af.compareTo(minaf)<0) { minaf=af; } } labels.add(minaf==null?"NOT_FOUND":this.groupByAfClassifier.apply(minaf)); } } if(byDepth) { final List<Object> depths= ctx.getAttributeAsList("DP"); if(depths.size()!=1) { if(depths.size()>1) { logger.warn("Too many data for DP :"+depths); } labels.add("NOT_AVAILABLE"); } else { Integer dp=null; final Object o = depths.get(0); if(o!=null && o instanceof Integer) { dp = Integer.class.cast(o); } else { try { int i=Integer.parseInt(String.valueOf(o)); dp=i; } catch(NumberFormatException err) { logger.warn("Not a number for DP :"+o); dp=null; } } labels.add(this.groupByDpClassifier.apply(dp)); } } for(final String att:this.presence_of_attributes) { labels.add(ctx.hasAttribute(att)); } final Category cat=new Category(labels); Long n = count.get(cat); count.put(cat, n==null?1L:n+1); } return count; } @Override protected GATKReportTable createGATKReportTable() { final GATKReportTable table=new GATKReportTable( "Variants", "Variants "+variants.getSource(),0); if(bychrom) table.addColumn("CONTIG"); if(byID) table.addColumn("IN_DBSNP"); if(byType) table.addColumn("TYPE"); if(byFilter) table.addColumn("FILTERED"); if(minQuality>=0) table.addColumn("QUAL_GE_"+this.minQuality); if(byImpact) table.addColumn("IMPACT"); if(bybiotype) table.addColumn("BIOTYPE"); if(bynalts) table.addColumn("N_ALT_ALLELES"); if(byCalled) table.addColumn("CALLED_SAMPLES"); if(byAffected) table.addColumn("AFFECTED_SAMPLES"); if(bySingleton) table.addColumn("SINGLETON"); if(byTsv) table.addColumn("Ts/Tv"); if(byAlleleSize) table.addColumn("ALLELE_SIZE"); if(byAlleFrequency) table.addColumn("ALLELE_FREQUENCY"); if(byDepth) table.addColumn("DEPTH"); for(final String att:this.presence_of_attributes) { table.addColumn("ATT:"+att); } return table; } }