package com.github.lindenb.jvarkit.tools.vcfcmp; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.TreeSet; import javax.xml.stream.XMLOutputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamWriter; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.SortingCollection; import htsjdk.tribble.readers.LineIterator; import htsjdk.tribble.readers.LineIteratorImpl; import htsjdk.tribble.readers.LineReader; import htsjdk.tribble.readers.SynchronousLineReader; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypeType; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.AbstractVCFCodec; import htsjdk.variant.vcf.VCFHeader; import com.beust.jcommander.Parameter; import com.beust.jcommander.ParametersDelegate; import com.github.lindenb.jvarkit.io.IOUtils; import com.github.lindenb.jvarkit.util.Counter; import com.github.lindenb.jvarkit.util.jcommander.Launcher; import com.github.lindenb.jvarkit.util.jcommander.Program; import com.github.lindenb.jvarkit.util.log.Logger; import com.github.lindenb.jvarkit.util.picard.AbstractDataCodec; import com.github.lindenb.jvarkit.util.so.SequenceOntologyTree; import com.github.lindenb.jvarkit.util.so.SequenceOntologyTree.Term; import com.github.lindenb.jvarkit.util.vcf.ContigPosRef; import com.github.lindenb.jvarkit.util.vcf.VCFUtils; import com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser; import com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParserFactory; import com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParser; import com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParserFactory; import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser; import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory; @Program(name="vcfcompare",description="Compares two VCF files") public class VCFCompare extends Launcher { private static final Logger LOG = Logger.build(VCFCompare.class).make(); @Parameter(names={"-o","--output"},description="Output file. Optional . Default: stdout") private File outputFile = null; @ParametersDelegate WritingSortingCollection writingSortingCollection=new WritingSortingCollection(); private Input inputs[]=new Input[]{null,null}; private abstract class Venn0 { final String title; Venn0(final String title) { this.title=title; } VariantContext filter(final VariantContext ctx,int file_id) { return ctx; } abstract void visit(final VariantContext ctx[]); void write(XMLStreamWriter w) throws XMLStreamException { w.writeStartElement("div"); w.writeStartElement("table"); w.writeStartElement("thead"); w.writeStartElement("caption"); w.writeCharacters(String.valueOf(title)); w.writeEndElement();//caption w.writeStartElement("tr"); for(String k:new String[]{"Key","Count"}) { w.writeStartElement("th"); w.writeCharacters(k); w.writeEndElement(); } w.writeEndElement();//tr w.writeEndElement();//thead w.writeStartElement("tbody"); /* for(String k:count.keySet()) { w.writeStartElement("tr"); w.writeStartElement("td"); w.writeCharacters(k); w.writeEndElement(); w.writeStartElement("td"); w.writeCharacters(String.valueOf(count.count(k))); w.writeEndElement(); w.writeEndElement();//tr } w.writeEndElement();//tbody */ w.writeEndElement();//table w.writeEndElement();//div } } private class Venn1 extends Venn0 { int uniq[]=new int[]{0,0}; int comm=0; Venn1(String title) { super(title); } void visit(VariantContext ctx[]) { VariantContext ctx0=filter(ctx[0], 0); VariantContext ctx1=filter(ctx[1], 1); if(ctx0!=null) { if(ctx1!=null) { comm++; } else { uniq[0]++; } } else if(ctx1!=null) { uniq[1]++; } } String getSampleName() { return "*"; } void write(XMLStreamWriter w) throws XMLStreamException { w.writeStartElement("tr"); w.writeStartElement("th"); w.writeCharacters(String.valueOf(title)); w.writeEndElement(); w.writeStartElement("th"); w.writeCharacters(getSampleName()); w.writeEndElement(); w.writeEndElement(); } } private class VennType extends Venn1 { private VariantContext.Type type; public VennType(VariantContext.Type type) { super(type.name()); this.type=type; } @Override VariantContext filter(VariantContext ctx, int file_id) { return ctx==null || !this.type.equals(ctx.getType())?null:ctx; } } private abstract class VennPred extends Venn1 { private SequenceOntologyTree.Term term; public VennPred( String prefix, SequenceOntologyTree.Term term) { super(prefix+" "+term.getAcn()); this.term=term; } abstract Set<SequenceOntologyTree.Term> terms(VariantContext ctx,int file_id); @Override VariantContext filter(VariantContext ctx, int file_id) { if(ctx==null) return null; return terms(ctx,file_id).contains(this.term)?ctx:null; } } private static class Transition<X> { X from; X to; Transition(X from,X to) { this.from=from; this.to=to; } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((from == null) ? 0 : from.hashCode()); result = prime * result + ((to == null) ? 0 : to.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (!(obj instanceof Transition)) return false; @SuppressWarnings("unchecked") Transition<X> other = (Transition<X>) obj; return from.equals(other.from) && to.equals(other.to); } @Override public String toString() { return from.toString()+" "+to.toString(); } } private abstract class Venn2 extends Venn0 { private String sample; Venn2(String title,String sample) { super(title); this.sample=sample; } abstract void visit(Genotype[] g); @Override void visit(VariantContext[] ctx) { VariantContext v0=filter(ctx[0],0); if(v0==null) return; Genotype g0=v0.getGenotype(this.sample); if(g0==null) return; VariantContext v1=filter(ctx[1],1); if(v1==null) return; Genotype g1=v1.getGenotype(this.sample); if(g1==null) return; visit(new Genotype[]{g0,g1}); } } private class VennGType extends Venn2 { Counter<Transition<GenotypeType>> count=new Counter<Transition<GenotypeType>>(); VennGType(String sample) { super("genotypes",sample); } @Override void visit(Genotype[] g) { count.incr(new Transition<GenotypeType>(g[0].getType(), g[1].getType())); } } private class Input { String filename; VCFHeader header; AbstractVCFCodec codec; SnpEffPredictionParser snpEffPredictionParser=null; VepPredictionParser vepPredictionParser=null; AnnPredictionParser annPredictionParser=null; } private class LineAndFile { int fileIdx; String line; private VariantContext _ctx=null; VariantContext getContext() { if(this._ctx==null) { this._ctx=inputs[this.fileIdx].codec.decode(this.line); } return this._ctx; } ContigPosRef getContigPosRef() { return new ContigPosRef(getContext()); } } private class LineAndFileCodec extends AbstractDataCodec<LineAndFile> { @Override public LineAndFile decode(DataInputStream dis) throws IOException { LineAndFile v=new LineAndFile(); try { v.fileIdx=dis.readInt(); } catch (Exception e) { return null; } v.line=readString(dis); return v; } @Override public void encode(DataOutputStream dos, LineAndFile v) throws IOException { dos.writeInt(v.fileIdx); writeString(dos,v.line); } @Override public AbstractDataCodec<LineAndFile> clone() { return new LineAndFileCodec(); } } private class LineAndFileComparator implements Comparator<LineAndFile> { @Override public int compare(final LineAndFile v1, final LineAndFile v2) { final ContigPosRef ctx1=v1.getContigPosRef(); final ContigPosRef ctx2=v2.getContigPosRef(); return ctx1.compareTo(ctx2); } } private VCFCompare() { } @Override public int doWork(final List<String> args) { if(args.isEmpty()) { LOG.error("VCFs missing."); return -1; } if(args.size()!=2) { System.err.println("Illegal number or arguments. Expected two VCFs"); return -1; } PrintWriter pw =null; XMLStreamWriter w=null; InputStream in=null; SortingCollection<LineAndFile> variants=null; try { LineAndFileComparator varcmp=new LineAndFileComparator(); variants=SortingCollection.newInstance(LineAndFile.class, new LineAndFileCodec(), varcmp, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpDirectories() ); variants.setDestructiveIteration(true); for(int i=0;i< 2;++i) { this.inputs[i]= new Input(); this.inputs[i].codec=VCFUtils.createDefaultVCFCodec(); this.inputs[i].filename= args.get(i); LOG.info("Opening "+this.inputs[i].filename); in=IOUtils.openURIForReading(this.inputs[i].filename); final LineReader lr= new SynchronousLineReader(in); final LineIterator li=new LineIteratorImpl(lr); this.inputs[i].header=(VCFHeader)this.inputs[i].codec.readActualHeader(li); this.inputs[i].vepPredictionParser=new VepPredictionParserFactory(this.inputs[i].header).get(); this.inputs[i].snpEffPredictionParser=new SnpEffPredictionParserFactory(this.inputs[i].header).get(); this.inputs[i].annPredictionParser=new AnnPredictionParserFactory(this.inputs[i].header).get(); while(li.hasNext()) { LineAndFile laf=new LineAndFile(); laf.fileIdx=i; laf.line=li.next(); variants.add(laf); } LOG.info("Done Reading "+this.inputs[i].filename); CloserUtil.close(li); CloserUtil.close(lr); CloserUtil.close(in); } variants.doneAdding(); LOG.info("Done Adding"); Set<String> commonSamples=new TreeSet<String>(this.inputs[0].header.getSampleNamesInOrder()); commonSamples.retainAll(this.inputs[1].header.getSampleNamesInOrder()); List<Venn0> venn1List=new ArrayList<VCFCompare.Venn0>(); venn1List.add(new Venn1("ALL")); venn1List.add(new Venn1("having ID") { @Override public VariantContext filter(VariantContext ctx,int fileIndex) { return ctx==null || !ctx.hasID()?null:ctx; } }); venn1List.add(new Venn1("QUAL greater 30") { @Override public VariantContext filter(VariantContext ctx,int fileIndex) { return ctx==null || !ctx.hasLog10PError() || ctx.getPhredScaledQual()<30.0?null:ctx; } }); for(VariantContext.Type t: VariantContext.Type.values()) { venn1List.add(new VennType(t)); } for(SequenceOntologyTree.Term term:SequenceOntologyTree.getInstance().getTerms()) { venn1List.add(new VennPred("vep",term) { @Override Set<Term> terms(VariantContext ctx, int file_id) { Set<Term> tt=new HashSet<SequenceOntologyTree.Term>(); for(VepPredictionParser.VepPrediction pred:VCFCompare.this.inputs[file_id].vepPredictionParser.getPredictions(ctx)) { tt.addAll(pred.getSOTerms()); } return tt; } }); venn1List.add(new VennPred("SnpEff",term) { @Override Set<Term> terms(VariantContext ctx, int file_id) { Set<Term> tt=new HashSet<SequenceOntologyTree.Term>(); for(SnpEffPredictionParser.SnpEffPrediction pred:VCFCompare.this.inputs[file_id].snpEffPredictionParser.getPredictions(ctx)) { tt.addAll(pred.getSOTerms()); } return tt; } }); venn1List.add(new VennPred("ANN",term) { @Override Set<Term> terms(VariantContext ctx, int file_id) { Set<Term> tt=new HashSet<SequenceOntologyTree.Term>(); for(AnnPredictionParser.AnnPrediction pred:VCFCompare.this.inputs[file_id].annPredictionParser.getPredictions(ctx)) { tt.addAll(pred.getSOTerms()); } return tt; } }); } for(String s:commonSamples) { venn1List.add(new VennGType(s)); } /* START : digest results ====================== */ Counter<String> diff=new Counter<String>(); List<LineAndFile> row=new ArrayList<LineAndFile>(); CloseableIterator<LineAndFile> iter=variants.iterator(); for(;;) { LineAndFile rec=null; if(iter.hasNext()) { rec=iter.next(); } if(rec==null || (!row.isEmpty() && varcmp.compare(row.get(0),rec)!=0)) { if(!row.isEmpty()) { diff.incr("count.variations"); VariantContext contexes_init[]=new VariantContext[]{null,null}; for(LineAndFile var:row) { if(contexes_init[var.fileIdx]!=null) { LOG.error("Duplicate context in "+inputs[var.fileIdx].filename+" : "+var.line); continue; } contexes_init[var.fileIdx]=var.getContext(); } for(Venn0 venn: venn1List) { venn.visit(contexes_init); } row.clear(); } if(rec==null) break; } row.add(rec); } iter.close(); /* END : digest results ====================== */ pw = super.openFileOrStdoutAsPrintWriter(outputFile); XMLOutputFactory xmlfactory= XMLOutputFactory.newInstance(); w= xmlfactory.createXMLStreamWriter(pw); w.writeStartElement("html"); w.writeStartElement("body"); /* specific samples */ w.writeStartElement("div"); w.writeStartElement("dl"); for(int i=0;i< 3;++i) { String title; Set<String> samples; switch(i) { case 0: case 1: title="Sample(s) for "+this.inputs[i].filename+"."; samples=new TreeSet<String>(this.inputs[i].header.getSampleNamesInOrder()); samples.removeAll(commonSamples); break; default: title="Common Sample(s)."; samples=new TreeSet<String>(commonSamples); break; } w.writeStartElement("dt"); w.writeCharacters(title); w.writeEndElement(); w.writeStartElement("dd"); w.writeStartElement("ol"); for(String s:samples) { w.writeStartElement("li"); w.writeCharacters(s); w.writeEndElement(); } w.writeEndElement(); w.writeEndElement(); } w.writeEndElement();//dl w.writeEndElement();//div for(Venn0 v: venn1List) { v.write(w); } w.writeEndElement();//body w.writeEndElement();//html w.writeEndDocument(); w.close();w=null; pw.flush(); pw.close(); pw=null; } catch(Exception err) { LOG.error(err); return -1; } finally { CloserUtil.close(w); CloserUtil.close(pw); if(variants!=null) variants.cleanup(); } return 0; } /** * @param args */ public static void main(String[] args) { new VCFCompare().instanceMainWithExit(args); } }