/*
The MIT License (MIT)
Copyright (c) 2014 Pierre Lindenbaum
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
History:
* 2014 creation
*/
package com.github.lindenb.jvarkit.tools.vcf2rdf;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFilterHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import com.beust.jcommander.Parameter;
import com.github.lindenb.jvarkit.io.IOUtils;
import com.github.lindenb.jvarkit.util.Pedigree;
import com.github.lindenb.jvarkit.util.jcommander.Launcher;
import com.github.lindenb.jvarkit.util.jcommander.Program;
import com.github.lindenb.jvarkit.util.log.Logger;
import com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress;
import com.github.lindenb.jvarkit.util.vcf.VCFUtils;
import com.github.lindenb.jvarkit.util.vcf.VcfIterator;
import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser;
import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser.VepPrediction;
import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory;
/**
BEGIN_DOC
### Example
```
$ java -jar dist/vcf2rdf.jar < in.vcf | xmllint --format -
```
```
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:vcf="http://github.com/lindenb/jvarkit/" xmlns:xsd="http://www.w3.org/2001/XMLSchema#">
<vcf:Chromosome rdf:about="urn:chromosome/1">
<dc:title>1</dc:title>
<vcf:length rdf:datatype="xsd:int">249250621</vcf:length>
<vcf:index rdf:datatype="xsd:int">0</vcf:index>
</vcf:Chromosome>
<vcf:Chromosome rdf:about="urn:chromosome/2">
<dc:title>2</dc:title>
<vcf:length rdf:datatype="xsd:int">243199373</vcf:length>
(...)
<vcf:Chromosome rdf:about="urn:chromosome/GL000192.1">
<dc:title>GL000192.1</dc:title>
<vcf:length rdf:datatype="xsd:int">547496</vcf:length>
<vcf:index rdf:datatype="xsd:int">83</vcf:index>
</vcf:Chromosome>
<vcf:Filter rdf:about="urn:filter/FILTER">
<dc:title>FILTER</dc:title>
<dc:description/>
</vcf:Filter>
<vcf:Sample rdf:about="urn:sample/V2528">
<dc:title>V2528</dc:title>
</vcf:Sample>
<vcf:Variant rdf:about="urn:variant/1">
<vcf:chromosome rdf:resource="urn:chromosome/8"/>
<vcf:start rdf:datatype="xsd:int">10467571</vcf:start>
<vcf:end rdf:datatype="xsd:int">10467571</vcf:end>
<vcf:ref>C</vcf:ref>
<vcf:alt>C</vcf:alt>
<vcf:alt>T</vcf:alt>
<vcf:qual rdf:datatype="xsd:double">1311.77</vcf:qual>
<vcf:BaseQRankSum>-2.441</vcf:BaseQRankSum>
<vcf:HaplotypeScore>365.0758</vcf:HaplotypeScore>
<vcf:QD>5.33</vcf:QD>
<vcf:MLEAC>1</vcf:MLEAC>
<vcf:MQ>49.60</vcf:MQ>
<vcf:MLEAF>0.500</vcf:MLEAF>
<vcf:AC>1</vcf:AC>
<vcf:FS>88.037</vcf:FS>
<vcf:MQRankSum>-10.623</vcf:MQRankSum>
<vcf:ReadPosRankSum>-5.556</vcf:ReadPosRankSum>
<vcf:Dels>0.02</vcf:Dels>
<vcf:DP>246</vcf:DP>
<vcf:AF>0.500</vcf:AF>
<vcf:MQ0>0</vcf:MQ0>
<vcf:AN>2</vcf:AN>
</vcf:Variant>
<vcf:Genotype rdf:about="urn:genotype/2">
<vcf:sample rdf:resource="urn:sample/V2528"/>
<vcf:variant rdf:resource="urn:variant/1"/>
<rdf:type rdf:resource="urn:genotype/het"/>
<vcf:allele>C</vcf:allele>
<vcf:allele>T</vcf:allele>
<vcf:dp rdf:datatype="xsd:int">229</vcf:dp>
<vcf:gq rdf:datatype="xsd:int">99</vcf:gq>
<vcf:pl>1340,0,5602</vcf:pl>
</vcf:Genotype>
<vcf:Variant rdf:about="urn:variant/3">
<vcf:chromosome rdf:resource="urn:chromosome/8"/>
<vcf:start rdf:datatype="xsd:int">10467576</vcf:start>
<vcf:end rdf:datatype="xsd:int">10467576</vcf:end>
<vcf:ref>T</vcf:ref>
<vcf:alt>T</vcf:alt>
<vcf:alt>C</vcf:alt>
<vcf:qual rdf:datatype="xsd:double">624.77</vcf:qual>
<vcf:BaseQRankSum>-4.704</vcf:BaseQRankSum>
<vcf:HaplotypeScore>415.3829</vcf:HaplotypeScore>
<vcf:QD>2.58</vcf:QD>
<vcf:MLEAC>1</vcf:MLEAC>
<vcf:MQ>48.90</vcf:MQ>
<vcf:MLEAF>0.500</vcf:MLEAF>
<vcf:AC>1</vcf:AC>
<vcf:FS>26.182</vcf:FS>
<vcf:MQRankSum>4.246</vcf:MQRankSum>
<vcf:ReadPosRankSum>0.781</vcf:ReadPosRankSum>
<vcf:Dels>0.00</vcf:Dels>
<vcf:DP>242</vcf:DP>
<vcf:AF>0.500</vcf:AF>
<vcf:MQ0>0</vcf:MQ0>
<vcf:AN>2</vcf:AN>
</vcf:Variant>
<vcf:Genotype rdf:about="urn:genotype/4">
<vcf:sample rdf:resource="urn:sample/V2528"/>
<vcf:variant rdf:resource="urn:variant/3"/>
<rdf:type rdf:resource="urn:genotype/het"/>
<vcf:allele>T</vcf:allele>
<vcf:allele>C</vcf:allele>
<vcf:dp rdf:datatype="xsd:int">230</vcf:dp>
<vcf:gq rdf:datatype="xsd:int">99</vcf:gq>
<vcf:pl>653,0,6159</vcf:pl>
</vcf:Genotype>
<vcf:Variant rdf:about="urn:variant/5">
<vcf:chromosome rdf:resource="urn:chromosome/8"/>
<vcf:start rdf:datatype="xsd:int">10467578</vcf:start>
<vcf:end rdf:datatype="xsd:int">10467581</vcf:end>
<vcf:ref>TTTC</vcf:ref>
<vcf:alt>TTTC</vcf:alt>
<vcf:alt>T</vcf:alt>
<vcf:qual rdf:datatype="xsd:double">2.14748360973E9</vcf:qual>
<vcf:FS>7.426</vcf:FS>
<vcf:AC>1</vcf:AC>
<vcf:BaseQRankSum>-3.400</vcf:BaseQRankSum>
<vcf:MQRankSum>-7.221</vcf:MQRankSum>
<vcf:ReadPosRankSum>-3.139</vcf:ReadPosRankSum>
<vcf:DP>243</vcf:DP>
<vcf:QD>29.69</vcf:QD>
<vcf:AF>0.500</vcf:AF>
<vcf:MQ0>0</vcf:MQ0>
<vcf:MLEAC>1</vcf:MLEAC>
<vcf:MQ>49.83</vcf:MQ>
<vcf:MLEAF>0.500</vcf:MLEAF>
<vcf:AN>2</vcf:AN>
</vcf:Variant>
<vcf:Genotype rdf:about="urn:genotype/6">
<vcf:sample rdf:resource="urn:sample/V2528"/>
<vcf:variant rdf:resource="urn:variant/5"/>
<rdf:type rdf:resource="urn:genotype/het"/>
<vcf:allele>TTTC</vcf:allele>
<vcf:allele>T</vcf:allele>
<vcf:dp rdf:datatype="xsd:int">243</vcf:dp>
<vcf:gq rdf:datatype="xsd:int">99</vcf:gq>
<h:pre><![CDATA[ <vcf:pl>5687,0,7242</vcf:pl>
</vcf:Genotype>
<vcf:Variant rdf:about="urn:variant/7">
<vcf:chromosome rdf:resource="urn:chromosome/8"/>
<vcf:start rdf:datatype="xsd:int">10467588</vcf:start>
<vcf:end rdf:datatype="xsd:int">10467588</vcf:end>
<vcf:ref>T</vcf:ref>
<vcf:alt>T</vcf:alt>
<vcf:alt>C</vcf:alt>
<vcf:qual rdf:datatype="xsd:double">610.77</vcf:qual>
<vcf:BaseQRankSum>5.163</vcf:BaseQRankSum>
<vcf:HaplotypeScore>106.1491</vcf:HaplotypeScore>
<vcf:QD>2.51</vcf:QD>
<vcf:MLEAC>1</vcf:MLEAC>
<vcf:MQ>54.46</vcf:MQ>
<vcf:MLEAF>0.500</vcf:MLEAF>
<vcf:AC>1</vcf:AC>
<vcf:FS>18.558</vcf:FS>
<vcf:MQRankSum>3.886</vcf:MQRankSum>
<vcf:ReadPosRankSum>0.762</vcf:ReadPosRankSum>
<vcf:Dels>0.00</vcf:Dels>
<vcf:DP>243</vcf:DP>
<vcf:AF>0.500</vcf:AF>
<vcf:MQ0>0</vcf:MQ0>
<vcf:AN>2</vcf:AN>
</vcf:Variant>
<vcf:Genotype rdf:about="urn:genotype/8">
<vcf:sample rdf:resource="urn:sample/V2528"/>
<vcf:variant rdf:resource="urn:variant/7"/>
<rdf:type rdf:resource="urn:genotype/het"/>
<vcf:allele>T</vcf:allele>
<vcf:allele>C</vcf:allele>
<vcf:dp rdf:datatype="xsd:int">231</vcf:dp>
<vcf:gq rdf:datatype="xsd:int">99</vcf:gq>
<vcf:pl>639,0,6257</vcf:pl>
</vcf:Genotype>
<vcf:Variant rdf:about="urn:variant/9">
<vcf:chromosome rdf:resource="urn:chromosome/8"/>
<vcf:start rdf:datatype="xsd:int">10467589</vcf:start>
<vcf:end rdf:datatype="xsd:int">10467589</vcf:end>
<vcf:ref>T</vcf:ref>
<vcf:alt>T</vcf:alt>
<vcf:alt>C</vcf:alt>
<vcf:qual rdf:datatype="xsd:double">3705.77</vcf:qual>
<vcf:BaseQRankSum>6.528</vcf:BaseQRankSum>
<vcf:HaplotypeScore>79.8219</vcf:HaplotypeScore>
<vcf:QD>15.19</vcf:QD>
<vcf:MLEAC>1</vcf:MLEAC>
<vcf:MQ>55.35</vcf:MQ>
<vcf:MLEAF>0.500</vcf:MLEAF>
<vcf:AC>1</vcf:AC>
<vcf:FS>19.873</vcf:FS>
<vcf:MQRankSum>2.222</vcf:MQRankSum>
<vcf:ReadPosRankSum>4.388</vcf:ReadPosRankSum>
<vcf:Dels>0.00</vcf:Dels>
<vcf:DP>244</vcf:DP>
<vcf:AF>0.500</vcf:AF>
<vcf:MQ0>0</vcf:MQ0>
<vcf:AN>2</vcf:AN>
</vcf:Variant>
<vcf:Genotype rdf:about="urn:genotype/10">
<vcf:sample rdf:resource="urn:sample/V2528"/>
<vcf:variant rdf:resource="urn:variant/9"/>
<rdf:type rdf:resource="urn:genotype/het"/>
<vcf:allele>T</vcf:allele>
<vcf:allele>C</vcf:allele>
<vcf:dp rdf:datatype="xsd:int">232</vcf:dp>
<vcf:gq rdf:datatype="xsd:int">99</vcf:gq>
<vcf:pl>3734,0,3449</vcf:pl>
</vcf:Genotype>
</rdf:RDF>
```
### Example
load with jena TDB
```
(cd /home/lindenb/src/jvarkit-git/ && make vcf2rdf)
export JENAROOT=/home/lindenb/package/apache-jena-2.11.0
rm -rf TMPTDB
for F in *.vcf.gz
do
gunzip -c ${F} |\
java -jar ${HOME}/src/jvarkit-git/dist/vcf2rdf.jar > tmp.rdf
${JENAROOT}/bin/tdbloader2 --loc=TMPTDB tmp.rdf
du -hs TMPTDB
done
rm tmp.rdf
${JENAROOT}/bin/tdbdump --loc=TMPTDB | head -n 100
loaded 78,258 variants / 20 Mbytes in Jena/TDB 11,193,250 triples / 892Mbytes
```
END_DOC
*/
@Program(name="vcf2rdf",description="convert VCF to RDF (N3 notation)")
public class VcfToRdf extends Launcher
{
private static final Logger LOG = Logger.build(VcfToRdf.class).make();
@Parameter(names={"-o","--output"},description="Output file. Optional . Default: stdout")
private File outputFile = null;
@Parameter(names={"-a","--alleles"},description="print ALT alleles")
private boolean printAlleles = false;
@Parameter(names={"-f","--filters"},description="print FILTERs")
private boolean printFilters = false;
@Parameter(names={"-vep","--vep"},description="print VEP informations")
private boolean printVep = false;
@Parameter(names={"-g","--genotypes"},description="print Genotypes informations")
private boolean printGenotypes = false;
//private long id_generator=0L;
private static final String XSD="http://www.w3.org/2001/XMLSchema#";
private static final String RDF=com.github.lindenb.jvarkit.util.ns.RDF.NS;
private static final String DC="http://purl.org/dc/elements/1.1/";
private static final String NS="http://github.com/lindenb/jvarkit/";
private final Set<String> suffixes=new HashSet<String>();
private PrintWriter w = null;
public VcfToRdf()
{
}
private void prefix(final String pfx,final String uri)
{
this.w.print("@prefix ");
this.w.print(pfx);
this.w.print(": <");
this.w.print(uri);
this.w.println("> .");
this.suffixes.add(pfx);
}
private void emitResource(final Object r)
{
if(r.getClass()==String.class)
{
w.print(String.class.cast(r));
}
else if(r.getClass()==URI.class)
{
w.print("<");
w.print(URI.class.cast(r).toString());
w.print(">");
}
else
{
throw new IllegalStateException(r.getClass().toString());
}
}
private void emitObject(final Object r)
{
if(r.getClass()==String.class)
{
String s=String.class.cast(r);
int colon = s.indexOf(':');
if(colon!=-1 && this.suffixes.contains(s.substring(0, colon)))
{
emitResource(URI.create(s));
}
else
{
w.print("\"");
w.print(s);
w.print("\"");
}
}
else if(r.getClass()==Integer.class)
{
w.print("\"");
w.print(Integer.class.cast(r));
w.print("\"^^xsd:int");
}
else if(r.getClass()==Double.class)
{
w.print("\"");
w.print(Double.class.cast(r));
w.print("\"^^xsd:double");
}
else if(r.getClass()==URI.class)
{
emitResource(r);
}
else
{
throw new IllegalStateException(r.getClass().toString());
}
}
private void emit(URI r,Object...array)
{
if(array.length==0 || array.length%2!=0)
throw new IllegalStateException(String.valueOf(array.length));
boolean subject_printed=false;
for(int i=0;i< array.length;i+=2)
{
if(array[i+0]==null || array[i+1]==null) continue;
if(!subject_printed)
{
emitResource(r);
subject_printed=true;
}
w.print(" ");
emitResource(array[i+0]);
w.print(" ");
emitObject(array[i+1]);
if( i+2 == array.length)
{
w.println(" . ");
}
else
{
w.print(" ; ");
}
}
}
private void writeHeader(final VCFHeader header,final URI source)
{
if(source!=null)
{
emit(source,"rdf:type","vcf:File");
}
final SAMSequenceDictionary dict=header.getSequenceDictionary();
if(dict!=null)
{
for(final SAMSequenceRecord ssr:dict.getSequences())
{
emit(URI.create("urn:chrom/"+ssr.getSequenceName()),
"rdf:type","vcf:Chromosome",
"dc:title",ssr.getSequenceName(),
"vcf:length",ssr.getSequenceLength(),
"vcf:sequenceIndex",ssr.getSequenceIndex()
);
}
}
for(final VCFFilterHeaderLine h:header.getFilterLines())
{
emit(URI.create("urn:filter/"+h.getID()),
"rdf:type","vcf:Filter",
"dc:title",h.getID(),
"dc:description",h.getValue()
);
}
final Pedigree pedigree = Pedigree.readPedigree(header);
for(final Pedigree.Person pe:pedigree.getPersons()) {
final URI sample = URI.create("urn:sample/"+pe.getId());
emit(sample,
"rdf:type","foaf:Person",
"foaf:name",pe.getId(),
"foaf:family",pe.getFamily().getId()
);
if(pe.isMale()) emit(sample,"idt:gender","male");
if(pe.isFemale()) emit(sample,"idt:gender","female");
if(pe.isAffected()) emit(sample,"idt:status","affected");
if(pe.isUnaffected()) emit(sample,"idt:status","unaffected");
}
//Sample
for(final String sample:header.getSampleNamesInOrder())
{
emit(URI.create("urn:sample/"+sample),
"rdf:type","vcf:Sample",
"dc:title",sample
);
}
}
private void scanVCF(final File filein) throws IOException
{
VcfIterator in=null;
URI source=null;
try {
if(filein!=null) source= filein.toURI();
in=(filein==null?VCFUtils.createVcfIteratorStdin():VCFUtils.createVcfIteratorFromFile(filein));
final VCFHeader header = in.getHeader();
final VepPredictionParser vepPredictionParser=new VepPredictionParserFactory(header).get();
writeHeader(header,source);
final SAMSequenceDictionaryProgress progress=new SAMSequenceDictionaryProgress(header);
while(in.hasNext())
{
if(this.w.checkError())
{
LOG.warn("I/O interruption");
break;
}
final VariantContext ctx = progress.watch(in.next());
/* Variant */
final URI variant = URI.create("urn:variant/"+ctx.getContig()+":"+ctx.getStart()+":"+ctx.getReference().getBaseString());
emit(variant,
"rdf:type","vcf:Variant",
"vcf:chrom",URI.create("urn:chrom/"+ctx.getContig()),
"vcf:position",ctx.getStart(),
"vcf:ref",ctx.getReference().getBaseString(),
"vcf:id",(ctx.hasID()?ctx.getID():null),
"vcf:qual",(ctx.hasLog10PError()?ctx.getPhredScaledQual():null)
);
if(this.printAlleles) {
for(final Allele alt: ctx.getAlternateAlleles())
{
emit(variant,"vcf:alt",alt.getBaseString());
}
}
if(this.printFilters) {
for(final String f:ctx.getFilters())
{
emit(variant,"vcf:filter",URI.create("urn:filter/"+f));
}
}
if(this.printVep) {
for(final VepPrediction prediction : vepPredictionParser.getPredictions(ctx))
{
/*
final List<Object> L=new ArrayList<>();
L.add("rdf:type");L.add("vep:Prediction");
L.add("vcf:variant"); L.add(variant);
L.add("vcf:allele");L.add(prediction.getAllele().getBaseString());
for(final SequenceOntologyTree.Term term:prediction.getSOTerms())
{
L.add("vcf:so");
L.add(URI.create(term.getUri()));
}
if(prediction.getEnsemblTranscript()!=null)
{
final URI transcriptid=URI.create("http://www.ensembl.org/id/"+prediction.getEnsemblTranscript());
L.add("vep:transcript");
L.add(transcriptid);
if(prediction.getEnsemblGene()!=null)
{
emit(transcriptid,
"uniprot:transcribedFrom",//used in uniprot dump
URI.create("http://www.ensembl.org/id/"+prediction.getEnsemblGene())
);
}
if(prediction.getEnsemblProtein()!=null)
{
emit(
transcriptid,
"uniprot:translatedTo",//used in uniprot dump
URI.create("http://www.ensembl.org/id/"+prediction.getEnsemblProtein())
);
}
}
emit(
URI.create("urn:vep/"+(++id_generator)),
L.toArray()
);
*/
}
}
if(this.printGenotypes) {
for(final String sample: ctx.getSampleNames())
{
final Genotype g = ctx.getGenotype(sample);
final List<Object> L=new ArrayList<>();
L.add("vcf:sample"); L.add(URI.create("urn:sample/"+sample));
L.add("vcf:variant"); L.add(variant);
L.add("rdf:type");L.add("vcf:Genotype");
if(g.hasDP())
{
L.add("vcf:dp");
L.add(g.getDP());
}
if(g.hasGQ())
{
L.add("vcf:gq");
L.add(g.getGQ());
}
if(g.isCalled())
{
if(g.isHet())
{
if(g.isHetNonRef())
{
L.add("rdf:type");L.add("vcf:HetNonRefGenotype");
}
else
{
L.add("rdf:type");L.add("vcf:HetGenotype");
}
}
else if(g.isHom())
{
if(g.isHomRef())
{
L.add("rdf:type");L.add("vcf:HomRefGenotype");
}
else
{
L.add("rdf:type");L.add("vcf:HomVarGenotype");
}
}
for(final Allele a:g.getAlleles())
{
L.add("vcf:allele");L.add(a.getBaseString());
}
}
emit(
URI.create("urn:gt/"+ctx.getContig()+":"+ctx.getStart()+":"+ctx.getReference().getBaseString()+":"+sample),
L.toArray()
);
}
}
}
in.close(); in = null;
progress.finish();
}
catch (final Exception e) {
throw new IOException(e);
}
finally
{
CloserUtil.close(in);
}
}
@Override
public int doWork(List<String> args) {
try
{
this.w= super.openFileOrStdoutAsPrintWriter(this.outputFile);
prefix("rdf",RDF);
prefix("dc", DC);
prefix("vcf", NS);
prefix("xsd", XSD);
prefix("vep", "http://www.ensembl.org/info/docs/tools/vep/");
prefix("uniprot", "http://purl.uniprot.org/core/");//used in RDF dump of uniprot
//prefix("so", "<http://www.sequenceontology.org/browser/current_svn/term/");
if(args.isEmpty())
{
scanVCF(null);
}
else
{
for(final String file: IOUtils.unrollFiles(args))
{
scanVCF(new File(file));
}
}
w.flush();
w.close();
w=null;
return RETURN_OK;
}
catch(final Exception err)
{
LOG.error(err);
return -1;
}
finally
{
CloserUtil.close(w);
}
}
public static void main(String[] args)
{
new VcfToRdf().instanceMainWithExit(args);
}
}