package org.molgenis.data.annotation.core.entity.impl;

import org.molgenis.data.DataService;
import org.molgenis.data.Entity;
import org.molgenis.data.annotation.core.RepositoryAnnotator;
import org.molgenis.data.annotation.core.entity.AnnotatorConfig;
import org.molgenis.data.annotation.core.entity.AnnotatorInfo;
import org.molgenis.data.annotation.core.entity.EntityAnnotator;
import org.molgenis.data.annotation.core.entity.impl.framework.AbstractAnnotator;
import org.molgenis.data.annotation.core.entity.impl.framework.RepositoryAnnotatorImpl;
import org.molgenis.data.annotation.core.filter.MultiAllelicResultFilter;
import org.molgenis.data.annotation.core.query.LocusQueryCreator;
import org.molgenis.data.annotation.core.resources.Resource;
import org.molgenis.data.annotation.core.resources.Resources;
import org.molgenis.data.annotation.core.resources.impl.RepositoryFactory;
import org.molgenis.data.annotation.core.resources.impl.ResourceImpl;
import org.molgenis.data.annotation.core.resources.impl.SingleResourceConfig;
import org.molgenis.data.annotation.core.resources.impl.tabix.TabixRepositoryFactory;
import org.molgenis.data.annotation.web.settings.CaddAnnotatorSettings;
import org.molgenis.data.annotation.web.settings.SingleFileLocationCmdLineAnnotatorSettingsConfigurer;
import org.molgenis.data.meta.model.Attribute;
import org.molgenis.data.meta.model.AttributeFactory;
import org.molgenis.data.meta.model.EntityType;
import org.molgenis.data.meta.model.EntityTypeFactory;
import org.molgenis.data.vcf.model.VcfAttributes;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

import java.util.ArrayList;
import java.util.List;

import static org.molgenis.data.meta.AttributeType.STRING;

/**
 * Spring configuration for the CADD annotator.
 *
 * <p>Declares two beans: the {@link RepositoryAnnotator} named {@link #NAME} and the
 * {@link Resource} named {@link #CADD_TABIX_RESOURCE}. {@link #init()} wires the actual
 * annotation logic into the annotator bean.
 */
@Configuration
public class CaddAnnotator implements AnnotatorConfig
{
	public static final String NAME = "cadd";

	// FIXME: nomenclature: http://cadd.gs.washington.edu/info
	public static final String CADD_SCALED = "CADD_SCALED";
	public static final String CADD_ABS = "CADD";
	public static final String CADD_SCALED_LABEL = "CADDSCALED";
	public static final String CADD_ABS_LABEL = "CADDABS";
	public static final String CADD_TABIX_RESOURCE = "CADDTabixResource";

	/** Description handed to {@link AnnotatorInfo#create}; kept verbatim from the cited source. */
	private static final String CADD_DESCRIPTION =
			"CADD is a tool for scoring the deleteriousness of single nucleotide variants as well as insertion/deletions variants in the human genome.\n"
					+ "While many variant annotation and scoring utils are around, most annotations tend to exploit a single information type (e.g. conservation) "
					+ "and/or are restricted in scope (e.g. to missense changes). "
					+ "Thus, a broadly applicable metric that objectively weights and integrates diverse information is needed. "
					+ "Combined Annotation Dependent Depletion (CADD) is a framework that integrates multiple "
					+ "annotations into one metric by contrasting variants that survived natural selection with simulated mutations.\n"
					+ "C-scores strongly correlate with allelic diversity, pathogenicity of both coding and non-coding variants, and experimentally measured "
					+ "regulatory effects, and also highly rank causal variants within "
					+ "individual genome sequences. Finally, C-scores of complex trait-associated variants from genome-wide association studies (GWAS) are "
					+ "significantly higher than matched controls and correlate with study sample size, likely reflecting the increased accuracy of larger GWAS.\n"
					+ "CADD can quantitatively prioritize functional, deleterious, and disease causal variants across a wide range of functional categories, "
					+ "effect sizes and genetic architectures and can be used prioritize "
					+ "causal variation in both research and clinical settings. (source: http://cadd.gs.washington.edu/info)";

	@Autowired
	private Entity caddAnnotatorSettings;

	@Autowired
	private Resources resources;

	@Autowired
	private DataService dataService;

	@Autowired
	private VcfAttributes vcfAttributes;

	@Autowired
	private EntityTypeFactory entityTypeFactory;

	@Autowired
	private AttributeFactory attributeFactory;

	/** Assigned by {@link #cadd()} and later completed by {@link #init()}. */
	private RepositoryAnnotatorImpl annotator;

	/**
	 * Creates the annotator bean and remembers it so that {@link #init()} can finish its setup.
	 *
	 * @return a new {@link RepositoryAnnotatorImpl} named {@link #NAME}
	 */
	@Bean
	public RepositoryAnnotator cadd()
	{
		annotator = new RepositoryAnnotatorImpl(NAME);
		return annotator;
	}

	/**
	 * Builds the CADD {@link EntityAnnotator} and hands it to the annotator created by
	 * {@link #cadd()}.
	 *
	 * <p>The {@code annotator} field is only assigned inside {@link #cadd()}, so this method
	 * relies on that bean method having been invoked first.
	 */
	@Override
	public void init()
	{
		List<Attribute> attributes = createCaddAnnotatorAttributes();

		AnnotatorInfo caddInfo = AnnotatorInfo
				.create(AnnotatorInfo.Status.READY, AnnotatorInfo.Type.PATHOGENICITY_ESTIMATE, NAME, CADD_DESCRIPTION,
						attributes);

		EntityAnnotator entityAnnotator = new AbstractAnnotator(CADD_TABIX_RESOURCE, caddInfo,
				new LocusQueryCreator(vcfAttributes), new MultiAllelicResultFilter(attributes, true, vcfAttributes),
				dataService, resources,
				new SingleFileLocationCmdLineAnnotatorSettingsConfigurer(CaddAnnotatorSettings.Meta.CADD_LOCATION,
						caddAnnotatorSettings))
		{
			@Override
			public List<Attribute> createAnnotatorAttributes(AttributeFactory attributeFactory)
			{
				// The parameter is intentionally unused: the enclosing configuration's own
				// injected factory is used, and a fresh attribute list is built on every call.
				return createCaddAnnotatorAttributes();
			}
		};

		annotator.init(entityAnnotator);
	}

	/**
	 * Creates a new list holding the raw ({@link #CADD_ABS}) attribute followed by the scaled
	 * ({@link #CADD_SCALED}) attribute, both built with the injected {@link AttributeFactory}.
	 */
	private List<Attribute> createCaddAnnotatorAttributes()
	{
		List<Attribute> attributes = new ArrayList<>();
		attributes.add(createCaddAbsAttr(attributeFactory));
		attributes.add(createCaddScaledAttr(attributeFactory));
		return attributes;
	}

	/**
	 * Creates the {@link #CADD_SCALED} attribute: data type STRING, label
	 * {@link #CADD_SCALED_LABEL}, with the explanatory description below.
	 *
	 * @param attributeFactory factory used to create the attribute
	 * @return the configured attribute
	 */
	public static Attribute createCaddScaledAttr(AttributeFactory attributeFactory)
	{
		return attributeFactory.create().setName(CADD_SCALED).setDataType(STRING).setDescription(
				"Since the raw scores do have relative meaning, one can take a specific group of variants, define the rank for each variant within that group, and then use "
						+ "that value as a \"normalized\" and now externally comparable unit of analysis. In our case, we scored and ranked all ~8.6 billion SNVs of the "
						+ "GRCh37/hg19 reference and then \"PHRED-scaled\" those values by expressing the rank in order of magnitude terms rather than the precise rank itself. "
						+ "For example, reference genome single nucleotide variants at the 10th-% of CADD scores are assigned to CADD-10, top 1% to CADD-20, top 0.1% to CADD-30, etc. "
						+ "The results of this transformation are the \"scaled\" CADD scores.(source: http://cadd.gs.washington.edu/info)")
				.setLabel(CADD_SCALED_LABEL);
	}

	/**
	 * Creates the {@link #CADD_ABS} attribute: data type STRING, label {@link #CADD_ABS_LABEL},
	 * with the explanatory description below.
	 *
	 * @param attributeFactory factory used to create the attribute
	 * @return the configured attribute
	 */
	static Attribute createCaddAbsAttr(AttributeFactory attributeFactory)
	{
		return attributeFactory.create().setName(CADD_ABS).setDataType(STRING).setDescription(
				"\"Raw\" CADD scores come straight from the model, and are interpretable as the extent to which the annotation profile for a given variant suggests that "
						+ "that variant is likely to be \"observed\" (negative values) vs \"simulated\" (positive values). These values have no absolute unit of meaning and are "
						+ "incomparable across distinct annotation combinations, training sets, or model parameters. However, raw values do have relative meaning, with higher values "
						+ "indicating that a variant is more likely to be simulated (or \"not observed\") and therefore more likely to have deleterious effects."
						+ "(source: http://cadd.gs.washington.edu/info)").setLabel(CADD_ABS_LABEL);
	}

	/**
	 * Declares the resource named {@link #CADD_TABIX_RESOURCE}, configured from the
	 * {@code CADD_LOCATION} setting of the injected annotator settings entity.
	 *
	 * <p>Its repository factory is a {@link TabixRepositoryFactory} over an entity type that
	 * contains the VCF chrom/pos/ref/alt attributes, the two CADD score attributes (STRING), and
	 * a hidden {@code id} attribute marked as the id attribute.
	 */
	@Bean
	Resource caddResource()
	{
		return new ResourceImpl(CADD_TABIX_RESOURCE,
				new SingleResourceConfig(CaddAnnotatorSettings.Meta.CADD_LOCATION, caddAnnotatorSettings))
		{
			@Override
			public RepositoryFactory getRepositoryFactory()
			{
				String idAttrName = "id";

				EntityType repoMetaData = entityTypeFactory.create().setName(CADD_TABIX_RESOURCE);
				repoMetaData.addAttribute(vcfAttributes.getChromAttribute());
				repoMetaData.addAttribute(vcfAttributes.getPosAttribute());
				repoMetaData.addAttribute(vcfAttributes.getRefAttribute());
				repoMetaData.addAttribute(vcfAttributes.getAltAttribute());
				repoMetaData.addAttribute(attributeFactory.create().setName(CADD_ABS).setDataType(STRING));
				repoMetaData.addAttribute(attributeFactory.create().setName(CADD_SCALED).setDataType(STRING));

				Attribute idAttribute = attributeFactory.create().setName(idAttrName).setVisible(false)
						.setIdAttribute(true);
				repoMetaData.addAttribute(idAttribute);

				return new TabixRepositoryFactory(repoMetaData);
			}
		};
	}
}