package org.molgenis.data.annotation.core.entity.impl;
import org.molgenis.data.DataService;
import org.molgenis.data.Entity;
import org.molgenis.data.annotation.core.RepositoryAnnotator;
import org.molgenis.data.annotation.core.entity.AnnotatorConfig;
import org.molgenis.data.annotation.core.entity.AnnotatorInfo;
import org.molgenis.data.annotation.core.entity.EntityAnnotator;
import org.molgenis.data.annotation.core.entity.impl.framework.AbstractAnnotator;
import org.molgenis.data.annotation.core.entity.impl.framework.RepositoryAnnotatorImpl;
import org.molgenis.data.annotation.core.filter.MultiAllelicResultFilter;
import org.molgenis.data.annotation.core.query.LocusQueryCreator;
import org.molgenis.data.annotation.core.resources.Resource;
import org.molgenis.data.annotation.core.resources.Resources;
import org.molgenis.data.annotation.core.resources.impl.RepositoryFactory;
import org.molgenis.data.annotation.core.resources.impl.ResourceImpl;
import org.molgenis.data.annotation.core.resources.impl.SingleResourceConfig;
import org.molgenis.data.annotation.core.resources.impl.tabix.TabixRepositoryFactory;
import org.molgenis.data.annotation.web.settings.SingleFileLocationCmdLineAnnotatorSettingsConfigurer;
import org.molgenis.data.meta.model.Attribute;
import org.molgenis.data.meta.model.AttributeFactory;
import org.molgenis.data.meta.model.EntityType;
import org.molgenis.data.meta.model.EntityTypeFactory;
import org.molgenis.data.vcf.model.VcfAttributes;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import java.util.List;
import static com.google.common.collect.Lists.newArrayList;
import static org.molgenis.data.annotation.web.settings.FitConAnnotatorSettings.Meta.FITCON_LOCATION;
import static org.molgenis.data.meta.AttributeType.STRING;
@Configuration
public class FitConAnnotator implements AnnotatorConfig
{
public static final String NAME = "fitcon";
public static final String FITCON_SCORE = "FITCON_SCORE";
public static final String FITCON_SCORE_LABEL = "FITCON_SCORE";
public static final String FITCON_TABIX_RESOURCE = "FitConTabixResource";
@Autowired
private Entity fitConAnnotatorSettings;
@Autowired
private DataService dataService;
@Autowired
private Resources resources;
@Autowired
private VcfAttributes vcfAttributes;
@Autowired
private EntityTypeFactory entityTypeFactory;
@Autowired
private AttributeFactory attributeFactory;
private RepositoryAnnotatorImpl annotator;
@Bean
public RepositoryAnnotator fitcon()
{
annotator = new RepositoryAnnotatorImpl(NAME);
return annotator;
}
@Override
public void init()
{
List<Attribute> attributes = createFitconOutputAttributes();
AnnotatorInfo fitconInfo = AnnotatorInfo
.create(AnnotatorInfo.Status.READY, AnnotatorInfo.Type.EFFECT_PREDICTION, NAME,
"Summary: Annotating genetic variants, especially non-coding variants, "
+ "for the purpose of identifying pathogenic variants remains a challenge. "
+ "Combined annotation-dependent depletion (CADD) is an al- gorithm designed "
+ "to annotate both coding and non-coding variants, and has been shown to "
+ "outper- form other annotation algorithms. CADD trains a linear kernel support"
+ " vector machine (SVM) to dif- ferentiate evolutionarily derived, likely benign,"
+ " alleles from simulated, likely deleterious, variants. However, SVMs cannot "
+ "capture non-linear relationships among the features, which can limit per- formance. "
+ "To address this issue, we have developed FITCON. FITCON uses the same feature set and "
+ "training data as CADD to train a deep neural network (DNN). DNNs can capture non-linear"
+ " relation- ships among features and are better suited than SVMs for problems with a "
+ "large number of samples and features. We exploit Compute Unified Device Architecture-compatible"
+ " graphics processing units and deep learning techniques such as dropout and momentum training to"
+ " accelerate the DNN train- ing. FITCON achieves about a 19%relative reduction in the error rate and"
+ " about a 14%relative increase in the area under the curve (AUC) metric over CADD’s SVMmethodology."
+ " All data and source code are available at https://cbcl.ics.uci.edu/ public_data/FITCON/. Contact:",
attributes);
EntityAnnotator entityAnnotator = new AbstractAnnotator(FITCON_TABIX_RESOURCE, fitconInfo,
new LocusQueryCreator(vcfAttributes), new MultiAllelicResultFilter(attributes, vcfAttributes),
dataService, resources,
new SingleFileLocationCmdLineAnnotatorSettingsConfigurer(FITCON_LOCATION, fitConAnnotatorSettings))
{
@Override
public List<Attribute> createAnnotatorAttributes(AttributeFactory attributeFactory)
{
return createFitconOutputAttributes();
}
};
annotator.init(entityAnnotator);
}
private List<Attribute> createFitconOutputAttributes()
{
List<Attribute> attributes = newArrayList();
Attribute fitcon_score = attributeFactory.create().setName(FITCON_SCORE).setDataType(STRING)
.setDescription("fitness consequence score annotation of genetic variants using Fitcon scoring.")
.setLabel(FITCON_SCORE_LABEL);
attributes.add(fitcon_score);
return attributes;
}
@Bean
Resource fitconResource()
{
Resource fitConTabixResource;
fitConTabixResource = new ResourceImpl(FITCON_TABIX_RESOURCE,
new SingleResourceConfig(FITCON_LOCATION, fitConAnnotatorSettings))
{
@Override
public RepositoryFactory getRepositoryFactory()
{
EntityType repoMetaData = entityTypeFactory.create().setName(FITCON_TABIX_RESOURCE);
repoMetaData.addAttribute(vcfAttributes.getChromAttribute());
repoMetaData.addAttribute(vcfAttributes.getPosAttribute());
repoMetaData.addAttribute(vcfAttributes.getRefAttribute());
repoMetaData.addAttribute(attributeFactory.create().setName("Anc"));
repoMetaData.addAttribute(vcfAttributes.getAltAttribute());
repoMetaData.addAttribute(attributeFactory.create().setName("Type"));
repoMetaData.addAttribute(attributeFactory.create().setName("Length"));
repoMetaData.addAttribute(attributeFactory.create().setName("isTv"));
repoMetaData.addAttribute(attributeFactory.create().setName("isDerived"));
repoMetaData.addAttribute(attributeFactory.create().setName("AnnoType"));
repoMetaData.addAttribute(attributeFactory.create().setName("Consequence"));
repoMetaData.addAttribute(attributeFactory.create().setName("ConsScore"));
repoMetaData.addAttribute(attributeFactory.create().setName("ConsDetail"));
repoMetaData.addAttribute(attributeFactory.create().setName("GC"));
repoMetaData.addAttribute(attributeFactory.create().setName("CpG"));
repoMetaData.addAttribute(attributeFactory.create().setName("mapAbility20bp"));
repoMetaData.addAttribute(attributeFactory.create().setName("mapAbility35bp"));
repoMetaData.addAttribute(attributeFactory.create().setName("scoreSegDup"));
repoMetaData.addAttribute(attributeFactory.create().setName("priPhCons"));
repoMetaData.addAttribute(attributeFactory.create().setName("mamPhCons"));
repoMetaData.addAttribute(attributeFactory.create().setName("verPhCons"));
repoMetaData.addAttribute(attributeFactory.create().setName("priPhyloP"));
repoMetaData.addAttribute(attributeFactory.create().setName("mamPhyloP"));
repoMetaData.addAttribute(attributeFactory.create().setName("verPhyloP"));
repoMetaData.addAttribute(attributeFactory.create().setName("GerpN"));
repoMetaData.addAttribute(attributeFactory.create().setName("GerpS"));
repoMetaData.addAttribute(attributeFactory.create().setName("GerpRS"));
repoMetaData.addAttribute(attributeFactory.create().setName("GerpRSpval"));
repoMetaData.addAttribute(attributeFactory.create().setName("bStatistic"));
repoMetaData.addAttribute(attributeFactory.create().setName("mutIndex"));
repoMetaData.addAttribute(attributeFactory.create().setName("dnaHelT"));
repoMetaData.addAttribute(attributeFactory.create().setName("dnaMGW"));
repoMetaData.addAttribute(attributeFactory.create().setName("dnaProT"));
repoMetaData.addAttribute(attributeFactory.create().setName("dnaRoll"));
repoMetaData.addAttribute(attributeFactory.create().setName("mirSVR-Score"));
repoMetaData.addAttribute(attributeFactory.create().setName("mirSVR-E"));
repoMetaData.addAttribute(attributeFactory.create().setName("mirSVR-Aln"));
repoMetaData.addAttribute(attributeFactory.create().setName("targetScan"));
// fitcons can be NA so we need to catch the string value
repoMetaData.addAttribute(attributeFactory.create().setName("FITCON_SCORE"));
repoMetaData.addAttribute(attributeFactory.create().setName("cHmmTssA"));
repoMetaData.addAttribute(attributeFactory.create().setName("cHmmTssAFlnk"));
repoMetaData.addAttribute(attributeFactory.create().setName("cHmmTxFlnk"));
repoMetaData.addAttribute(attributeFactory.create().setName("cHmmTx"));
repoMetaData.addAttribute(attributeFactory.create().setName("cHmmTxWk"));
repoMetaData.addAttribute(attributeFactory.create().setName("cHmmEnhG"));
repoMetaData.addAttribute(attributeFactory.create().setName("cHmmEnh"));
repoMetaData.addAttribute(attributeFactory.create().setName("cHmmZnfRpts"));
repoMetaData.addAttribute(attributeFactory.create().setName("cHmmHet"));
repoMetaData.addAttribute(attributeFactory.create().setName("cHmmTssBiv"));
repoMetaData.addAttribute(attributeFactory.create().setName("cHmmBivFlnk"));
repoMetaData.addAttribute(attributeFactory.create().setName("cHmmEnhBiv"));
repoMetaData.addAttribute(attributeFactory.create().setName("cHmmReprPC"));
repoMetaData.addAttribute(attributeFactory.create().setName("cHmmReprPCWk"));
repoMetaData.addAttribute(attributeFactory.create().setName("cHmmQuies"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncExp"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncH3K27Ac"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncH3K4Me1"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncH3K4Me3"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncNucleo"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncOCC"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncOCCombPVal"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncOCDNasePVal"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncOCFairePVal"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncOCpolIIPVal"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncOCctcfPVal"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncOCmycPVal"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncOCDNaseSig"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncOCFaireSig"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncOCpolIISig"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncOCctcfSig"));
repoMetaData.addAttribute(attributeFactory.create().setName("EncOCmycSig"));
repoMetaData.addAttribute(attributeFactory.create().setName("Segway"));
repoMetaData.addAttribute(attributeFactory.create().setName("tOverlapMotifs"));
repoMetaData.addAttribute(attributeFactory.create().setName("motifDist"));
repoMetaData.addAttribute(attributeFactory.create().setName("motifECount"));
repoMetaData.addAttribute(attributeFactory.create().setName("motifEName"));
repoMetaData.addAttribute(attributeFactory.create().setName("motifEHIPos"));
repoMetaData.addAttribute(attributeFactory.create().setName("motifEScoreChng"));
repoMetaData.addAttribute(attributeFactory.create().setName("TFBS"));
repoMetaData.addAttribute(attributeFactory.create().setName("TFBSPeaks"));
repoMetaData.addAttribute(attributeFactory.create().setName("TFBSPeaksMax"));
repoMetaData.addAttribute(attributeFactory.create().setName("isKnownVariant"));
repoMetaData.addAttribute(attributeFactory.create().setName("ESP_AF"));
repoMetaData.addAttribute(attributeFactory.create().setName("ESP_AFR"));
repoMetaData.addAttribute(attributeFactory.create().setName("ESP_EUR"));
repoMetaData.addAttribute(attributeFactory.create().setName("TG_AF"));
repoMetaData.addAttribute(attributeFactory.create().setName("TG_ASN"));
repoMetaData.addAttribute(attributeFactory.create().setName("TG_AMR"));
repoMetaData.addAttribute(attributeFactory.create().setName("TG_AFR"));
repoMetaData.addAttribute(attributeFactory.create().setName("TG_EUR"));
repoMetaData.addAttribute(attributeFactory.create().setName("minDistTSS"));
repoMetaData.addAttribute(attributeFactory.create().setName("minDistTSE"));
repoMetaData.addAttribute(attributeFactory.create().setName("GeneID"));
repoMetaData.addAttribute(attributeFactory.create().setName("FeatureID"));
repoMetaData.addAttribute(attributeFactory.create().setName("CCDS"));
repoMetaData.addAttribute(attributeFactory.create().setName("GeneName"));
repoMetaData.addAttribute(attributeFactory.create().setName("cDNApos"));
repoMetaData.addAttribute(attributeFactory.create().setName("relcDNApos"));
repoMetaData.addAttribute(attributeFactory.create().setName("CDSpos"));
repoMetaData.addAttribute(attributeFactory.create().setName("relCDSpos"));
repoMetaData.addAttribute(attributeFactory.create().setName("protPos"));
repoMetaData.addAttribute(attributeFactory.create().setName("relProtPos"));
repoMetaData.addAttribute(attributeFactory.create().setName("Domain"));
repoMetaData.addAttribute(attributeFactory.create().setName("Dst2Splice"));
repoMetaData.addAttribute(attributeFactory.create().setName("Dst2SplType"));
repoMetaData.addAttribute(attributeFactory.create().setName("Exon"));
repoMetaData.addAttribute(attributeFactory.create().setName("Intron"));
repoMetaData.addAttribute(attributeFactory.create().setName("oAA"));
repoMetaData.addAttribute(attributeFactory.create().setName("nAA"));
repoMetaData.addAttribute(attributeFactory.create().setName("Grantham"));
repoMetaData.addAttribute(attributeFactory.create().setName("PolyPhenCat"));
repoMetaData.addAttribute(attributeFactory.create().setName("PolyPhenVal"));
repoMetaData.addAttribute(attributeFactory.create().setName("SIFTcat"));
repoMetaData.addAttribute(attributeFactory.create().setName("SIFTval"));
repoMetaData.addAttribute(attributeFactory.create().setName("RawScore"));
repoMetaData.addAttribute(attributeFactory.create().setName("PHRED"));
Attribute idAttribute = attributeFactory.create().setName("id").setVisible(false).setIdAttribute(true);
repoMetaData.addAttribute(idAttribute);
return new TabixRepositoryFactory(repoMetaData);
}
};
return fitConTabixResource;
}
}