/*
The MIT License (MIT)
Copyright (c) 2016 Pierre Lindenbaum
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
History:
* 2016 creation
*/
package com.github.lindenb.jvarkit.tools.vcfeigen;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import com.github.lindenb.jvarkit.util.vcf.InfoAnnotator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.CloseableTribbleIterator;
import htsjdk.tribble.Feature;
import htsjdk.tribble.TabixFeatureReader;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.tribble.readers.PositionalBufferedStream;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
public class EigenInfoAnnotator implements InfoAnnotator
{
private static class KeyAndType
{
final String key;
final VCFHeaderLineCount count;
final VCFHeaderLineType type;
KeyAndType(final String key,final VCFHeaderLineCount count,VCFHeaderLineType type)
{
this.key=key;
this.count=count;
this.type=type;
}
KeyAndType(final String key)
{
this(key,VCFHeaderLineCount.A,VCFHeaderLineType.Float);
}
}
public static abstract class AbstractFeature implements Feature
{
final String contig;
final int pos;
final Allele ref;
final Allele alt;
final String tokens[];
private AbstractFeature(final String tokens[]) {
this.contig = tokens[0];
this.pos = Integer.parseInt(tokens[1]);
this.ref = Allele.create(tokens[2], true);
this.alt = Allele.create(tokens[3], false);
this.tokens=tokens;
}
boolean accept(final VariantContext ctx) {
int C1 = contig2eigen(ctx.getContig());
int C2 = contig2eigen(this.contig);
if(C1==-1 || C2==-1 || C1!=C2) return false;
if(!ctx.getReference().equals(this.ref)) return false;
if(!ctx.hasAllele(this.alt)) return false;
return true;
}
@Override
public String getChr()
{
return contig;
}
@Override
public String getContig()
{
return contig;
}
@Override
public int getStart()
{
return pos;
}
@Override
public int getEnd()
{
return pos;
}
String get(int i) {
return i>=0 && i< this.tokens.length?tokens[i]:null;
}
@Override
public String toString() {
return Arrays.toString(this.tokens);
}
}
public static class CodingFeature extends AbstractFeature
{
private CodingFeature(final String tokens[]) {
super(tokens);
}
}
public static class NonCodingFeature extends AbstractFeature
{
private NonCodingFeature(final String tokens[]) {
super(tokens);
}
}
public static abstract class AbstractFeatureCodec<T extends AbstractFeature> extends AsciiFeatureCodec<T>
{
protected final Pattern tab=Pattern.compile("[\t]");
protected AbstractFeatureCodec(Class<T> C){
super(C);
}
@Override
public boolean canDecode(final String path)
{
return path.endsWith(".tab.gz");
}
@Override
public Object readActualHeader(LineIterator arg0)
{
return null;
}
}
/* public because http://gatkforums.broadinstitute.org/gatk/discussion/8851/ */
public static class CodingFeatureCodec extends AbstractFeatureCodec<CodingFeature>
{
public CodingFeatureCodec() {
super(CodingFeature.class);
}
@Override
public CodingFeature decode(final String line)
{
if(line.startsWith("chr")) return null;
return new CodingFeature(tab.split(line));
}
}
/* public because http://gatkforums.broadinstitute.org/gatk/discussion/8851/ */
public static class NonCodingFeatureCodec extends AbstractFeatureCodec<NonCodingFeature>
{
public NonCodingFeatureCodec() {
super(NonCodingFeature.class);
}
@Override
public NonCodingFeature decode(String line)
{
if(line.startsWith("chr")) return null;
return new NonCodingFeature(tab.split(line));
}
}
private File eigenDirectory = null;
private final List<VCFInfoHeaderLine> noncodingheaderlines = new ArrayList<>();
private final List<VCFInfoHeaderLine> codingheaderlines = new ArrayList<>();
private TabixFeatureReader<NonCodingFeature, PositionalBufferedStream> nonCodingFeatureReader = null;
private TabixFeatureReader<CodingFeature, PositionalBufferedStream> codingFeatureReader = null;
private int prev_contig=-1;
public EigenInfoAnnotator(final File dir) {
this.eigenDirectory = dir;
IOUtil.assertDirectoryIsReadable(dir);
//this.noncodingheaderlines.add(new VCFInfoHeaderLine(prefix+"chr", VCFHeaderLineCount.A, VCFHeaderLineType.String, "chr"));
//this.noncodingheaderlines.add(new VCFInfoHeaderLine(prefix+"position", VCFHeaderLineCount.A, VCFHeaderLineType.String, "position"));
//this.noncodingheaderlines.add(new VCFInfoHeaderLine(prefix+"ref", VCFHeaderLineCount.A, VCFHeaderLineType.String, "ref"));
//this.noncodingheaderlines.add(new VCFInfoHeaderLine(prefix+"alt", VCFHeaderLineCount.A, VCFHeaderLineType.String, "alt"));
for(final KeyAndType key : new KeyAndType[]{
new KeyAndType("GERP_NR"), new KeyAndType("GERP_RS"), new KeyAndType("PhyloPri"), new KeyAndType("PhyloPla"),
new KeyAndType("PhyloVer"), new KeyAndType("PhastPri"), new KeyAndType("PhastPla"), new KeyAndType("PhastVer"),
new KeyAndType("H3K4Me1"), new KeyAndType("H3K4Me3"), new KeyAndType("H3K27ac"), new KeyAndType("TFBS_max"),
new KeyAndType("TFBS_sum"), new KeyAndType("TFBS_num"), new KeyAndType("OCPval"), new KeyAndType("DnaseSig"),
new KeyAndType("DnasePval"), new KeyAndType("FaireSig"), new KeyAndType("FairePval"), new KeyAndType("PolIISig"),
new KeyAndType("PolIIPval"), new KeyAndType("ctcfSig"), new KeyAndType("ctcfPval"), new KeyAndType("cmycSig"),
new KeyAndType("cmycPval"), new KeyAndType("Eigen-raw"), new KeyAndType("Eigen-phred"), new KeyAndType("Eigen-PC-raw"),
new KeyAndType("Eigen-PC-phred")}
)
{
final String prefix="EIGEN_NC_";
final VCFInfoHeaderLine vihl = new VCFInfoHeaderLine(
prefix+key.key.replace('-', '_'),
key.count,
key.type,
key.key+ " (Non-Coding)"
);
this.noncodingheaderlines.add(vihl);
}
for(final KeyAndType key : new KeyAndType[]{
new KeyAndType("SIFT"), new KeyAndType("PolyPhenDIV"), new KeyAndType("PolyPhenVar"), new KeyAndType("MA"),
new KeyAndType("GERP_NR"), new KeyAndType("GERP_RS"), new KeyAndType("PhyloPri"), new KeyAndType("PhyloPla"),
new KeyAndType("PhyloVer"), new KeyAndType("PhastPri"), new KeyAndType("PhastPla"), new KeyAndType("PhastVer"),
new KeyAndType("Consequence",VCFHeaderLineCount.A,VCFHeaderLineType.String),
new KeyAndType("Eigen-raw"), new KeyAndType("Eigen-phred"), new KeyAndType("Eigen-PC-raw"),
new KeyAndType("Eigen-PC-phred")
}
)
{
final String prefix="EIGEN_CODING_";
final VCFInfoHeaderLine vihl = new VCFInfoHeaderLine(
prefix+key.key.replace('-', '_'),
key.count,
key.type,
key.key+ " (Coding)"
);
this.codingheaderlines.add(vihl);
}
}
private static int contig2eigen(String chr) {
if(chr.toLowerCase().startsWith("chr")) chr=chr.substring(3);
if(chr.isEmpty() || !Character.isDigit(chr.charAt(0))) return -1;
try {
int c=Integer.parseInt(chr.trim());
if(c<1 || c>22) return -1;
return c;
}
catch(NumberFormatException err) {
return -1;
}
}
private String tabixPrefix="Eigen_hg19_";
public void setTabixFilePrefix(String prefix) {
this.tabixPrefix = prefix;
}
public String getTabixPrefix() {
return tabixPrefix;
}
private File getNonCodingFileForContig(final int C)
{
if(this.eigenDirectory==null) throw new IllegalStateException("Eigein directory was not defined");
return new File(this.eigenDirectory,getTabixPrefix()+"noncoding_annot_chr"+C+".tab.bgz");
}
@Override
public String getName()
{
return "Eigen";
}
@Override
public String getDescription()
{
return "Annotator for the data of https://xioniti01.u.hpc.mssm.edu/v1.1/ : Eigen makes use of a variety of functional annotations in both coding and noncoding regions (such as made available by the ENCODE and Roadmap Epigenomics projects), and combines them into one single measure of functional importance.";
}
@Override
public Set<VCFInfoHeaderLine> getInfoHeaderLines()
{
final Set<VCFInfoHeaderLine> set = new HashSet<>(this.noncodingheaderlines);
set.addAll(this.codingheaderlines);
return set;
}
@Override
public Map<String, Object> getAnnotations(final VariantContext ctx)
{
if(!ctx.isVariant()) return Collections.emptyMap();
final int contig = contig2eigen(ctx.getContig());
if( contig < 1) return Collections.emptyMap();
try {
if(this.codingFeatureReader == null) {
this.codingFeatureReader = new TabixFeatureReader<>(
new File(this.eigenDirectory,getTabixPrefix()+"coding_annot_04092016.tab.bgz").getPath(),
new CodingFeatureCodec());
}
if( this.prev_contig==-1 || prev_contig!=contig)
{
CloserUtil.close(this.nonCodingFeatureReader);
this.nonCodingFeatureReader = new TabixFeatureReader<>(
getNonCodingFileForContig(contig).getPath(),
new NonCodingFeatureCodec());
this.prev_contig = contig;
}
final Map<Allele,NonCodingFeature> alt2nonCoding= new HashMap<>();
CloseableTribbleIterator<NonCodingFeature> iter1= this.nonCodingFeatureReader.query(
String.valueOf(contig), ctx.getStart(),ctx.getEnd());
while(iter1.hasNext()) {
NonCodingFeature feat = iter1.next();
if(feat==null || !feat.accept(ctx)) continue;
alt2nonCoding.put(feat.alt, feat);
}
iter1.close();
final Map<Allele,CodingFeature> alt2coding= new HashMap<>();
CloseableTribbleIterator<CodingFeature> iter2= this.codingFeatureReader.query(
String.valueOf(contig), ctx.getStart(),ctx.getEnd());
while(iter2.hasNext()) {
CodingFeature feat = iter2.next();
if(feat==null || !feat.accept(ctx)) continue;
alt2coding.put(feat.alt, feat);
}
iter2.close();
if( alt2nonCoding.isEmpty() && alt2coding.isEmpty() ) return Collections.emptyMap();
final List<Allele> alternateAlleles = ctx.getAlternateAlleles();
final Map<String, Object> map =new HashMap<>();
for(int side=0;side<2;++side)
{
for(int i=0;i< (side==0?noncodingheaderlines.size():codingheaderlines.size());++i)
{
final VCFInfoHeaderLine vihl = (side==0?noncodingheaderlines.get(i):codingheaderlines.get(i));
final List<Object> atts = new ArrayList<>(alternateAlleles.size());
boolean found_one=false;
for(int altn=0;altn<alternateAlleles.size();++altn)
{
final Allele alt= alternateAlleles.get(altn);
final AbstractFeature feat = (side==0?alt2nonCoding.get(alt):alt2coding.get(alt));
if( feat == null) {
atts.add(".");
continue;
}
final String token = feat.get(4+i);
if(token==null || token.isEmpty() || token.equals("."))
{
atts.add(".");
continue;
}
else if( vihl.getType()==VCFHeaderLineType.String) {
found_one=true;
atts.add(token.replace(',', '|'));
}
else
{
try {
Float dbl = new Float(token);
atts.add(dbl);
found_one=true;
} catch (NumberFormatException err) {
throw new IOException("Cannot cast "+token+" to float for "+vihl);
}
}
}
if(!found_one)
{
//nothing
}
else if(vihl.getCountType()==VCFHeaderLineCount.R)
{
map.put(vihl.getID(),new ArrayList<>(new LinkedHashSet<>(atts)));
}
else
{
if(atts.size()!=ctx.getAlternateAlleles().size()) {
System.err.println("Number of eigen data!=number of ALTS");
}
map.put(vihl.getID(), atts);
}
}
}
return map;
}
catch(final IOException err) {
throw new RuntimeException(err);
}
}
@Override
public void close()
{
prev_contig=-1;
CloserUtil.close(this.nonCodingFeatureReader);
CloserUtil.close(this.codingFeatureReader);
this.nonCodingFeatureReader=null;
this.codingFeatureReader=null;
}
}