/**
 *
 */
package com.github.lindenb.jvarkit.tools.vcfcmp;

import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import com.github.lindenb.jvarkit.util.jcommander.Program;
import com.github.lindenb.jvarkit.util.log.Logger;
import com.github.lindenb.jvarkit.util.so.SequenceOntologyTree;
import com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParser;
import com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParser.SnpEffPrediction;
import com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParserFactory;
import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser;
import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser.VepPrediction;
import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory;

import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.variant.variantcontext.VariantContext;

/**
 * @author lindenb
 */
@Program(name="vcfcmppred",description="Compare predictions (SNPEff, VEP) for several VCFs")
public class VCFComparePredictions extends AbstractVCFCompareBase {
	private VCFComparePredictions() {
	}

	private final Logger LOG = Logger.build(VCFComparePredictions.class).make();

	/** holds one VEP parser and one SnpEff parser built from a given input VCF header */
	static private class PredictionTuple {
		VepPredictionParser vepPredictionParser;
		SnpEffPredictionParser snpEffPredictionParser;
	}

	/** prints the leading CHROM/POS/ID/REF columns of one report line */
	private static void startLine(final PrintWriter out, final VariantContext ctx) {
		out.print(ctx.getContig());
		out.print("\t");
		out.print(ctx.getStart());
		out.print("\t");
		out.print(ctx.hasID() ? ctx.getID() : ".");
		out.print("\t");
		out.print(ctx.getReference() == null ? "." : ctx.getReference().getDisplayString());
	}

	/** returns the symmetric difference of two sets of SO terms */
	private static Set<SequenceOntologyTree.Term> unshared(
			final Set<SequenceOntologyTree.Term> set1,
			final Set<SequenceOntologyTree.Term> set2) {
		final Set<SequenceOntologyTree.Term> so_terms = new HashSet<SequenceOntologyTree.Term>();
		for (final SequenceOntologyTree.Term t : set1) {
			if (!set2.contains(t)) so_terms.add(t);
		}
		for (final SequenceOntologyTree.Term t : set2) {
			if (!set1.contains(t)) so_terms.add(t);
		}
		return so_terms;
	}

	/** collects all SO terms over all VEP predictions of one variant */
	private static Set<SequenceOntologyTree.Term> getVepSoTerms(final VepPredictionParser parser, final VariantContext ctx) {
		final Set<SequenceOntologyTree.Term> so_terms = new HashSet<SequenceOntologyTree.Term>();
		for (final VepPrediction pred : parser.getPredictions(ctx)) {
			so_terms.addAll(pred.getSOTerms());
		}
		return so_terms;
	}

	private static void printSOSet(final PrintWriter out, final Set<SequenceOntologyTree.Term> set) {
		for (final SequenceOntologyTree.Term t : set) {
			out.print(" ");
			out.print(t.getAcn() + "[" + t.getLabel() + "]");
		}
	}

	/** prints, for each input, the SO terms seen only in that input */
	private static void printDiscordantSO(
			final PrintWriter out,
			final Input input1, final Set<SequenceOntologyTree.Term> so1,
			final Input input2, final Set<SequenceOntologyTree.Term> so2) {
		out.print("\t");
		Set<SequenceOntologyTree.Term> set = new HashSet<>(so1);
		set.removeAll(so2);
		if (!set.isEmpty()) {
			out.print("only in " + input1.filename + ":");
			printSOSet(out, set);
		}
		out.print("\t");
		set = new HashSet<>(so2);
		set.removeAll(so1);
		if (!set.isEmpty()) {
			out.print("only in " + input2.filename + ":");
			printSOSet(out, set);
		}
		out.println();
	}

	/** collects all SO terms over all SnpEff predictions of one variant */
	private static Set<SequenceOntologyTree.Term> getSnpEffSoTerms(final SnpEffPredictionParser parser, final VariantContext ctx) {
		final Set<SequenceOntologyTree.Term> so_terms = new HashSet<SequenceOntologyTree.Term>();
		for (final SnpEffPrediction pred : parser.getPredictions(ctx)) {
			so_terms.addAll(pred.getSOTerms());
		}
		return so_terms;
	}
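	/**
	 * Loads every input VCF into a disk-backed SortingCollection keyed on
	 * (contig, position, reference), then walks the sorted stream grouping
	 * records that share the same key. For each pair of inputs within a group,
	 * it reports discordant transcript counts and discordant Sequence Ontology
	 * terms for both the VEP and the SnpEff annotations; groups with no
	 * discordance are reported as PASS.
	 */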
	@Override
	public int doWork(final List<String> args) {
		PrintWriter out = null;
		SortingCollection<LineAndFile> variants = null;
		try {
			if (args.isEmpty()) {
				LOG.error("Illegal number of arguments");
				return -1;
			}
			out = super.openFileOrStdoutAsPrintWriter(super.outputFile);
			variants = SortingCollection.newInstance(
					LineAndFile.class,
					new AbstractVCFCompareBase.LineAndFileCodec(),
					new AbstractVCFCompareBase.LineAndFileComparator(),
					super.sortingCollectionArgs.getMaxRecordsInRam(),
					super.sortingCollectionArgs.getTmpDirectories());
			variants.setDestructiveIteration(true);

			// load every input VCF into the sorting collection
			for (final String filename : args) {
				LOG.info("Reading from " + filename);
				final Input input = super.put(variants, filename);
				LOG.info("end reading " + input.filename);
			}

			// build one (VEP, SnpEff) parser pair per input header
			final List<PredictionTuple> predictionTuples = new ArrayList<PredictionTuple>(super.inputs.size());
			for (final AbstractVCFCompareBase.Input input : this.inputs) {
				final PredictionTuple predictionTuple = new PredictionTuple();
				predictionTuple.snpEffPredictionParser = new SnpEffPredictionParserFactory(input.codecAndHeader.header).get();
				predictionTuple.vepPredictionParser = new VepPredictionParserFactory(input.codecAndHeader.header).get();
				predictionTuples.add(predictionTuple);
			}

			final List<AbstractVCFCompareBase.LineAndFile> row = new ArrayList<LineAndFile>(super.inputs.size());
			final CloseableIterator<LineAndFile> iter = variants.iterator();
			final Comparator<LineAndFile> posCompare = (A, B) -> A.getContigPosRef().compareTo(B.getContigPosRef());

			for (;;) {
				LineAndFile rec = null;
				if (iter.hasNext()) {
					rec = iter.next();
				}
				// 'row' accumulates records sharing the same (contig,pos,ref);
				// flush it when the key changes or the iterator is exhausted
				if (rec == null || (!row.isEmpty() && posCompare.compare(row.get(0), rec) != 0)) {
					if (!row.isEmpty()) {
						boolean printed = false;
						final VariantContext ctx = row.get(0).getContext();
						if (row.size() != this.inputs.size()) {
							startLine(out, ctx);
							out.println("\tDiscordant number of variants");
							printed = true;
						}
						// compare every pair of inputs present in this group
						for (int i = 0; i + 1 < row.size(); ++i) {
							final Input input1 = this.inputs.get(row.get(i).fileIdx);
							final VariantContext ctx1 = row.get(i).getContext();
							final PredictionTuple predtuple1 = predictionTuples.get(row.get(i).fileIdx);
							final List<VepPrediction> vepPredictions1 = predtuple1.vepPredictionParser.getPredictions(ctx1);
							final List<SnpEffPrediction> snpEffPredictions1 = predtuple1.snpEffPredictionParser.getPredictions(ctx1);
							final Set<SequenceOntologyTree.Term> so_vep_1 = getVepSoTerms(predtuple1.vepPredictionParser, ctx1);
							final Set<SequenceOntologyTree.Term> so_snpeff_1 = getSnpEffSoTerms(predtuple1.snpEffPredictionParser, ctx1);

							for (int j = i + 1; j < row.size(); ++j) {
								final Input input2 = this.inputs.get(row.get(j).fileIdx);
								final VariantContext ctx2 = row.get(j).getContext();
								final PredictionTuple predtuple2 = predictionTuples.get(row.get(j).fileIdx);
								final List<VepPrediction> vepPredictions2 = predtuple2.vepPredictionParser.getPredictions(ctx2);
								final List<SnpEffPrediction> snpEffPredictions2 = predtuple2.snpEffPredictionParser.getPredictions(ctx2);
								final Set<SequenceOntologyTree.Term> so_vep_2 = getVepSoTerms(predtuple2.vepPredictionParser, ctx2);
								final Set<SequenceOntologyTree.Term> so_snpeff_2 = getSnpEffSoTerms(predtuple2.snpEffPredictionParser, ctx2);

								if (vepPredictions1.size() != vepPredictions2.size()) {
									startLine(out, ctx);
									out.print("\tVEP discordant transcripts count");
									out.print("\t" + input1.filename + ":" + vepPredictions1.size());
									out.print("\t" + input2.filename + ":" + vepPredictions2.size());
									out.println();
									printed = true;
								}
								if (snpEffPredictions1.size() != snpEffPredictions2.size()) {
									startLine(out, ctx);
									out.print("\tSNPEFF discordant transcripts count");
									out.print("\t" + input1.filename + ":" + snpEffPredictions1.size());
									out.print("\t" + input2.filename + ":" + snpEffPredictions2.size());
									out.println();
									printed = true;
								}
								if (!unshared(so_vep_1, so_vep_2).isEmpty()) {
									startLine(out, ctx);
									out.print("\tVEP discordant SO:terms");
									printDiscordantSO(out, input1, so_vep_1, input2, so_vep_2);
									printed = true;
								}
								if (!unshared(so_snpeff_1, so_snpeff_2).isEmpty()) {
									startLine(out, ctx);
									out.print("\tSNPEFF discordant SO:terms");
									printDiscordantSO(out, input1, so_snpeff_1, input2, so_snpeff_2);
									printed = true;
								}
							}
						}
						if (!printed) {
							startLine(out, ctx);
							out.println("\tPASS");
						}
						row.clear();
					}
					if (rec == null) break;
				}
				row.add(rec);
			}
			iter.close();

			out.flush();
			out.close();
			out = null;
			return 0;
		} catch (final Exception err) {
			LOG.error(err);
			return -1;
		} finally {
			CloserUtil.close(out);
			try {
				if (variants != null) variants.cleanup();
			} catch (final Exception err) {
				// ignore cleanup errors
			}
		}
	}

	public static void main(final String[] args) {
		new VCFComparePredictions().instanceMainWithExit(args);
	}
}