package com.github.lindenb.jvarkit.util.vcf.rdf; import java.io.IOException; import java.io.OutputStream; import java.net.URI; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.logging.Logger; import javax.xml.XMLConstants; import javax.xml.stream.XMLOutputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamWriter; import com.github.lindenb.jvarkit.tools.vcfannot.VCFPredictions; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFFilterHeaderLine; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFInfoHeaderLine; import com.github.lindenb.jvarkit.util.so.SequenceOntologyTree.Term; import com.github.lindenb.jvarkit.util.vcf.predictions.MyPredictionParser; import com.github.lindenb.jvarkit.util.vcf.predictions.Prediction; import com.github.lindenb.jvarkit.util.vcf.predictions.PredictionParser; import com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParser; import com.github.lindenb.jvarkit.util.vcf.predictions.SnpEffPredictionParserFactory; import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser; import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory; /** * RDFVcfWriter * @author lindenb * */ public class RDFVcfWriter implements VariantContextWriter { private static final Logger LOG=Logger.getLogger("jvarkit"); private static final String XSD="http://www.w3.org/2001/XMLSchema#"; private static final String RDF=com.github.lindenb.jvarkit.util.ns.RDF.NS; private static final String DC="http://purl.org/dc/elements/1.1/"; private static final String NS="http://github.com/lindenb/jvarkit/"; private static final String PFX="vcf"; private XMLStreamWriter w; private VCFHeader header; private long id_generator=0L; private OutputStream delegateOut; private Map<String,RDFVcfInfoHandler> key2infoHandler=new HashMap<String,RDFVcfInfoHandler>(); private URI source=null; private boolean _xmlHeaderPrinted=false; public RDFVcfWriter(XMLStreamWriter writer) { this.w=writer; } public RDFVcfWriter(OutputStream delegateOut) throws IOException { try { this.delegateOut=delegateOut; XMLOutputFactory xmlfactory= XMLOutputFactory.newInstance(); this.w= xmlfactory.createXMLStreamWriter(delegateOut,"UTF-8"); } catch (XMLStreamException e) { throw new IOException(e); } } public void addInfoHandler(RDFVcfInfoHandler handler) { this.key2infoHandler.put(handler.getKey(), handler); } private void datatype(String t) throws XMLStreamException { this.w.writeAttribute("rdf", RDF, "datatype", "xsd:"+t); } protected RDFVcfInfoHandler createDefaultRdfVcfInfoHandlerFor(VCFInfoHeaderLine h) { return new DefaultInfoHandler(h); } public void writeStartDocument() throws XMLStreamException { if(_xmlHeaderPrinted) return; w.writeStartDocument("UTF-8","1.0"); this.w.writeStartElement("rdf", "RDF", RDF); w.writeAttribute("xmlns", XMLConstants.XML_NS_URI, "rdf",RDF); w.writeAttribute("xmlns", XMLConstants.XML_NS_URI, "dc", DC); w.writeAttribute("xmlns", XMLConstants.XML_NS_URI, PFX, NS); w.writeAttribute("xmlns", XMLConstants.XML_NS_URI, "xsd", XSD); _xmlHeaderPrinted=true; } @Override public void writeHeader(VCFHeader header) { this.writeHeader(header, null); } public void writeHeader(VCFHeader header,URI source) { if(this.header!=null) throw new RuntimeException("Header was already written"); this.header=header; this.source=source; if(this.source==null) this.source=URI.create("urn:source/id"+(++id_generator)); try { writeStartDocument(); this.w.writeStartElement(PFX, "Source", NS); this.w.writeAttribute("rdf",RDF,"about",this.source.toString()); this.w.writeStartElement("dc","title",DC); this.w.writeCharacters(this.source.toString()); this.w.writeEndElement();//dc:title this.w.writeEndElement(); SAMSequenceDictionary dict=header.getSequenceDictionary(); if(dict!=null) { for(SAMSequenceRecord ssr:dict.getSequences()) { this.w.writeStartElement(PFX, "Chromosome", NS); this.w.writeAttribute("rdf",RDF,"about","urn:chromosome/"+ssr.getSequenceName()); this.w.writeStartElement("dc","title",DC); this.w.writeCharacters(ssr.getSequenceName()); this.w.writeEndElement();//dc:title this.w.writeStartElement(PFX,"length",NS); datatype("int"); this.w.writeCharacters(String.valueOf(ssr.getSequenceLength())); this.w.writeEndElement();//length this.w.writeStartElement(PFX,"index",NS); datatype("int"); this.w.writeCharacters(String.valueOf(ssr.getSequenceIndex())); this.w.writeEndElement();//length this.w.writeEndElement();//Chromosome } } key2infoHandler.put(SnpEffPredictionParser.getDefaultTag(), new SnpEffHandler()); key2infoHandler.put(VepPredictionParser.getDefaultTag(), new VepHandler()); key2infoHandler.put(VCFPredictions.TAG, new MyPredictionHandler()); for(VCFInfoHeaderLine h:header.getInfoHeaderLines()) { RDFVcfInfoHandler handler= key2infoHandler.get(h.getID()); if(handler==null) { LOG.info("creating default handler for INFO:"+h.getID()); handler=createDefaultRdfVcfInfoHandlerFor(h); key2infoHandler.put(handler.getKey(), handler); } handler.init( h); } for(VCFFilterHeaderLine h:header.getFilterLines()) { this.w.writeStartElement(PFX, "Filter", NS); this.w.writeAttribute("rdf",RDF,"about","urn:filter/"+h.getKey()); this.w.writeStartElement("dc","title",DC); this.w.writeCharacters(h.getKey()); this.w.writeEndElement();//dc:title this.w.writeStartElement("dc","description",DC); this.w.writeCharacters(h.getValue()); this.w.writeEndElement();//dc:title this.w.writeEndElement();//Filter } //Sample for(String sample:header.getSampleNamesInOrder()) { this.w.writeStartElement(PFX, "Sample", NS); this.w.writeAttribute("rdf",RDF,"about","urn:sample/"+sample); this.w.writeStartElement("dc","title",DC); this.w.writeCharacters(sample); this.w.writeEndElement();//dc:title this.w.writeEndElement();//rdf:RDF } } catch(Exception e) { throw new RuntimeException("close failed",e); } } @Override public void add(VariantContext ctx) { if(this.header==null) throw new RuntimeException("No header was written."); try { long variant_id=++id_generator; this.w.writeStartElement(PFX, "Variant", NS); this.w.writeAttribute("rdf",RDF,"about","urn:variant/"+variant_id); this.w.writeEmptyElement(PFX, "source",NS); this.w.writeAttribute("rdf",RDF,"resource",this.source.toString()); this.w.writeEmptyElement(PFX, "chromosome",NS); this.w.writeAttribute("rdf",RDF,"resource","urn:chromosome/"+ctx.getContig()); this.w.writeStartElement(PFX,"start",NS); datatype("int"); this.w.writeCharacters(String.valueOf(ctx.getStart())); this.w.writeEndElement(); this.w.writeStartElement(PFX,"end",NS); datatype("int"); this.w.writeCharacters(String.valueOf(ctx.getEnd())); this.w.writeEndElement(); if(ctx.hasID()) { if(ctx.getID().matches("rs[0-9]+")) { this.w.writeEmptyElement(PFX,"ID",NS); this.w.writeAttribute("rdf",RDF,"resource", "http://www.ncbi.nlm.nih.gov/snp/"+ ctx.getID().substring(2) ); } else { this.w.writeStartElement(PFX,"ID",NS); this.w.writeCharacters(ctx.getID()); this.w.writeEndElement(); } } this.w.writeStartElement(PFX,"ref",NS); this.w.writeCharacters(ctx.getReference().getBaseString()); this.w.writeEndElement(); for(Allele a:ctx.getAlleles()) { this.w.writeStartElement(PFX,"alt",NS); this.w.writeCharacters(a.getBaseString()); this.w.writeEndElement(); } if(ctx.hasLog10PError()) { this.w.writeStartElement(PFX,"qual",NS); datatype("double"); this.w.writeCharacters(String.valueOf(ctx.getPhredScaledQual())); this.w.writeEndElement(); } for(String filt:ctx.getFilters()) { this.w.writeEmptyElement(PFX,"filter",NS); this.w.writeAttribute("rdf",RDF,"resource", "urn:filter/"+ filt ); } //INFO for(String key:ctx.getAttributes().keySet()) { RDFVcfInfoHandler handler=this.key2infoHandler.get(key); if(handler==null) continue; handler.handle(ctx); } this.w.writeEndElement();//Variant for(Genotype g:ctx.getGenotypes()) { if(!g.isAvailable()) continue; if(!g.isCalled()) continue; long genotype_id=++id_generator; this.w.writeStartElement(PFX, "Genotype", NS); this.w.writeAttribute("rdf",RDF,"about","urn:genotype/"+genotype_id); this.w.writeEmptyElement(PFX, "sample",NS); this.w.writeAttribute("rdf",RDF,"resource","urn:sample/"+g.getSampleName()); this.w.writeEmptyElement(PFX, "variant",NS); this.w.writeAttribute("rdf",RDF,"resource","urn:variant/"+variant_id); /* this.w.writeEmptyElement("rdf", "type",RDF); this.w.writeAttribute("rdf",RDF,"resource","urn:genotype/"+(g.isAvailable()?"available":"unavaliable")); */ /* if(g.isCalled() ) { this.w.writeEmptyElement("rdf", "type",RDF); this.w.writeAttribute("rdf",RDF,"resource","urn:genotype/called"); }*/ if(g.isFiltered()) { this.w.writeEmptyElement("rdf", "type",RDF); this.w.writeAttribute("rdf",RDF,"resource","urn:genotype/filtered"); } if(g.isHom()) { this.w.writeEmptyElement("rdf", "type",RDF); this.w.writeAttribute("rdf",RDF,"resource","urn:genotype/hom"); } if(g.isHet()) { this.w.writeEmptyElement("rdf", "type",RDF); this.w.writeAttribute("rdf",RDF,"resource","urn:genotype/het"); } if(g.isHomRef()) { this.w.writeEmptyElement("rdf", "type",RDF); this.w.writeAttribute("rdf",RDF,"resource","urn:genotype/homRef"); } if(g.isHomVar()) { this.w.writeEmptyElement("rdf", "type",RDF); this.w.writeAttribute("rdf",RDF,"resource","urn:genotype/homVar"); } if(g.isMixed()) { this.w.writeEmptyElement("rdf", "type",RDF); this.w.writeAttribute("rdf",RDF,"resource","urn:genotype/mixed"); } if(g.isPhased()) { this.w.writeEmptyElement("rdf", "type",RDF); this.w.writeAttribute("rdf",RDF,"resource","urn:genotype/phased"); } Set<String> seen=new HashSet<String>(); for(Allele a:g.getAlleles()) { if(a.isNoCall() || seen.contains(a.getBaseString())) continue; this.w.writeStartElement(PFX,"allele",NS); this.w.writeCharacters(a.getBaseString()); this.w.writeEndElement(); seen.add(a.getBaseString()); } if(g.hasDP()) { this.w.writeStartElement(PFX,"dp",NS); datatype("int"); this.w.writeCharacters(String.valueOf(g.getDP())); this.w.writeEndElement(); } if(g.hasGQ()) { this.w.writeStartElement(PFX,"gq",NS); datatype("int"); this.w.writeCharacters(String.valueOf(g.getGQ())); this.w.writeEndElement(); } if(g.hasPL()) { int pl[]=g.getPL(); this.w.writeStartElement(PFX,"pl",NS); for(int i=0;i<pl.length;++i) { if(i>0) this.w.writeCharacters(","); this.w.writeCharacters(String.valueOf(pl[i])); } this.w.writeEndElement(); } this.w.writeEndElement(); } } catch(XMLStreamException e) { throw new RuntimeException("add failed",e); } } @Override public void close() { if(this.w==null) return; if(this.header==null) throw new RuntimeException("No header was written."); try { this.w.writeEndElement();//rdf:RDF this.w.writeEndDocument(); this.w.flush(); this.w.close(); if(this.delegateOut!=null) {delegateOut.flush(); delegateOut.close();} this.w=null; } catch (Exception e) { e.printStackTrace(); throw new RuntimeException("close failed",e); } } public interface RDFVcfInfoHandler { public String getKey(); public void init( VCFInfoHeaderLine line ) throws XMLStreamException; public void handle( VariantContext ctx ) throws XMLStreamException; } private abstract class AbstractInfoHandler implements RDFVcfInfoHandler { protected VCFInfoHeaderLine info; public AbstractInfoHandler(VCFInfoHeaderLine info) { this.info=info; } @Override public void init(VCFInfoHeaderLine line) throws XMLStreamException { w.writeStartElement(PFX,"InfoHeader",NS); w.writeAttribute("rdf", RDF, "about", "urn:info/"+line.getID()); w.writeStartElement(PFX,"id",NS); w.writeCharacters(line.getID()); w.writeEndElement(); w.writeStartElement(PFX,"description",NS); w.writeCharacters(line.getDescription()); w.writeEndElement(); w.writeStartElement(PFX,"countType",NS); w.writeCharacters(line.getCountType().name()); w.writeEndElement(); w.writeStartElement(PFX,"type",NS); w.writeCharacters(line.getType().name()); w.writeEndElement(); w.writeEndElement(); } protected abstract void handleObject(Object o) throws XMLStreamException; @Override public void handle(VariantContext ctx) throws XMLStreamException { Object o=ctx.getAttribute(this.getKey()); if(o==null) return; if(o.getClass().isArray()) { Object array[]=(Object[])o; for(Object o2:array) handleObject(o2); } else if(o instanceof Collection) { @SuppressWarnings("rawtypes") Collection array=(Collection)o; for(Object o2:array) handleObject(o2); } else { handleObject(o); } } @Override public String getKey() { return info.getID(); } }; private abstract class AbstractPredHandler implements RDFVcfInfoHandler { public AbstractPredHandler() { } abstract PredictionParser getPredictionParser(); abstract String getLocalName(); @Override public void init(VCFInfoHeaderLine line) throws XMLStreamException { // TODO Auto-generated method stub } @Override public void handle(VariantContext ctx) throws XMLStreamException { w.writeComment("X"+ctx.getAttribute(getPredictionParser().getTag())); for(final Prediction pred:this.getPredictionParser().getPredictions(ctx)) { /* w.writeStartElement(PFX,"prediction",NS); w.writeStartElement(PFX,getLocalName(),NS); Integer i=pred.getAminoAcidPosition(); if(i!=null) { w.writeStartElement(PFX,"aminoAcidPosition",NS); datatype("int"); w.writeCharacters(String.valueOf(i)); w.writeEndElement(); } String s=pred.getEnsemblTranscript(); if(s!=null) { w.writeEmptyElement(PFX,"enst",NS); w.writeAttribute("rdf",RDF,"resource","http://www.ensembl.org/"+s); } s=pred.getEnsemblProtein(); if(s!=null) { w.writeEmptyElement(PFX,"ensp",NS); w.writeAttribute("rdf",RDF,"resource","http://www.ensembl.org/"+s); } s=pred.getGeneName(); if(s!=null) { w.writeStartElement(PFX,"geneName",NS); w.writeCharacters(s); w.writeEndElement(); } s=pred.getReferenceAminoAcid(); if(s!=null) { w.writeStartElement(PFX,"refAA",NS); w.writeCharacters(s); w.writeEndElement(); } s=pred.getAltAminoAcid(); if(s!=null) { w.writeStartElement(PFX,"altAA",NS); w.writeCharacters(s); w.writeEndElement(); } for(Term term:pred.getSOTerms()) { w.writeStartElement(PFX,"so",NS); w.writeCharacters(term.getAcn()); w.writeEndElement(); } w.writeEndElement(); w.writeEndElement(); */ } } } private class SnpEffHandler extends AbstractPredHandler { private SnpEffPredictionParser predFactory; public SnpEffHandler() { this.predFactory=new SnpEffPredictionParserFactory(RDFVcfWriter.this.header).get(); } @Override PredictionParser getPredictionParser() { return predFactory; } @Override public String getKey() { return predFactory.getTag(); } @Override String getLocalName() { return "snpEff"; } } private class VepHandler extends AbstractPredHandler { private VepPredictionParser predFactory; public VepHandler() { this.predFactory=new VepPredictionParserFactory(RDFVcfWriter.this.header).get(); } @Override PredictionParser getPredictionParser() { return predFactory; } @Override public String getKey() { return predFactory.getTag(); } @Override String getLocalName() { return "vep"; } } private class MyPredictionHandler extends AbstractPredHandler { private MyPredictionParser predFactory; public MyPredictionHandler() { this.predFactory=new MyPredictionParser(RDFVcfWriter.this.header); } @Override PredictionParser getPredictionParser() { return predFactory; } @Override public String getKey() { return predFactory.getTag(); } @Override String getLocalName() { return "myprediction"; } } private class DefaultInfoHandler extends AbstractInfoHandler { public DefaultInfoHandler(VCFInfoHeaderLine info) { super(info); } @Override protected void handleObject(Object o) throws XMLStreamException { w.writeStartElement(PFX, getKey(), NS); if(o.getClass()==Double.class) { datatype("double"); } else if(o.getClass()==Float.class) { datatype("float"); } else if(o.getClass()==Integer.class) { datatype("int"); } w.writeCharacters(String.valueOf(o)); w.writeEndElement(); } }; @Override public boolean checkError() { return false; } }