package org.opencb.opencga.storage.core.search;
import org.apache.commons.lang3.StringUtils;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.annotation.ConsequenceTypeMappings;
import org.opencb.biodata.models.variant.avro.*;
import org.opencb.biodata.models.variant.stats.VariantStats;
import org.opencb.commons.datastore.core.ComplexTypeConverter;
import java.util.*;
/**
* Created by wasim on 14/11/16.
*/
public class VariantSearchToVariantConverter implements ComplexTypeConverter<Variant, VariantSearchModel> {
public static final double MISSING_VALUE = -100.0;
/**
* Conversion: from storage type to data model.
*
* @param variantSearchModel Storage type object
* @return Data model object
*/
@Override
public Variant convertToDataModelType(VariantSearchModel variantSearchModel) {
// set chromosome, start, end, ref, alt from ID
Variant variant = new Variant(variantSearchModel.getId());
// set ID, chromosome, start, end, ref, alt, type
variant.setId(variantSearchModel.getVariantId());
// set variant type
variant.setType(VariantType.valueOf(variantSearchModel.getType()));
// set studies and stats
Map<String, StudyEntry> studyEntryMap = new HashMap<>();
if (variantSearchModel.getStudies() != null && variantSearchModel.getStudies().size() > 0) {
List<StudyEntry> studies = new ArrayList<>();
variantSearchModel.getStudies().forEach(s -> {
StudyEntry entry = new StudyEntry();
entry.setStudyId(s);
studies.add(entry);
studyEntryMap.put(s, entry);
});
variant.setStudies(studies);
}
if (variantSearchModel.getStats() != null && variantSearchModel.getStats().size() > 0) {
for (String key: variantSearchModel.getStats().keySet()) {
// key consists of 'stats' + "__" + studyId + "__" + cohort
String[] fields = key.split("__");
if (fields[1].contains("_")) {
String[] split = fields[1].split("_");
fields[1] = split[split.length - 1];
}
if (studyEntryMap.containsKey(fields[1])) {
VariantStats variantStats = new VariantStats();
variantStats.setMaf(variantSearchModel.getStats().get(key));
studyEntryMap.get(fields[1]).setStats(fields[2], variantStats);
} else {
System.out.println("Something wrong happened: stats " + key + ", but there is no study for that stats.");
}
}
}
// process annotation
VariantAnnotation variantAnnotation = new VariantAnnotation();
// consequence types
String gene = null;
String ensGene = null;
Map<String, ConsequenceType> consequenceTypeMap = new HashMap<>();
for (int i = 0; i < variantSearchModel.getGenes().size(); i++) {
if (!variantSearchModel.getGenes().get(i).startsWith("ENS")) {
gene = variantSearchModel.getGenes().get(i);
}
if (variantSearchModel.getGenes().get(i).startsWith("ENSG")) {
ensGene = variantSearchModel.getGenes().get(i);
}
if (variantSearchModel.getGenes().get(i).startsWith("ENST")) {
ConsequenceType consequenceType = new ConsequenceType();
consequenceType.setGeneName(gene);
consequenceType.setEnsemblGeneId(ensGene);
consequenceType.setEnsemblTranscriptId(variantSearchModel.getGenes().get(i));
// setProteinVariantAnnotation is postponed, since it will only be set if SO accession is 1583
// The key is the ENST id
consequenceTypeMap.put(variantSearchModel.getGenes().get(i), consequenceType);
}
}
// prepare protein substitution scores: sift and polyphen
List<Score> scores;
ProteinVariantAnnotation proteinAnnotation = new ProteinVariantAnnotation();
if (variantSearchModel.getSift() != MISSING_VALUE || variantSearchModel.getPolyphen() != MISSING_VALUE) {
scores = new ArrayList<>();
if (variantSearchModel.getSift() != MISSING_VALUE) {
scores.add(new Score(variantSearchModel.getSift(), "sift", variantSearchModel.getSiftDesc()));
}
if (variantSearchModel.getPolyphen() != MISSING_VALUE) {
scores.add(new Score(variantSearchModel.getPolyphen(), "polyphen", variantSearchModel.getPolyphenDesc()));
}
proteinAnnotation.setSubstitutionScores(scores);
}
// and finally, update the SO accession for each consequence type
// and setProteinVariantAnnotation if SO accession is 1583
for (String geneToSoAcc: variantSearchModel.getGeneToSoAcc()) {
String[] fields = geneToSoAcc.split("_");
if (consequenceTypeMap.containsKey(fields[0])) {
int soAcc = Integer.parseInt(fields[1]);
SequenceOntologyTerm sequenceOntologyTerm = new SequenceOntologyTerm();
sequenceOntologyTerm.setAccession("SO:" + String.format("%07d", soAcc));
sequenceOntologyTerm.setName(ConsequenceTypeMappings.accessionToTerm.get(soAcc));
if (consequenceTypeMap.get(fields[0]).getSequenceOntologyTerms() == null) {
consequenceTypeMap.get(fields[0]).setSequenceOntologyTerms(new ArrayList<>());
}
consequenceTypeMap.get(fields[0]).getSequenceOntologyTerms().add(sequenceOntologyTerm);
// only set protein for that consequence type
// if annotated protein and SO accession is 1583 (missense_variant)
if (soAcc == 1583) {
consequenceTypeMap.get(fields[0]).setProteinVariantAnnotation(proteinAnnotation);
}
}
}
// and update the variant annotation with the consequence types
variantAnnotation.setConsequenceTypes(new ArrayList<>(consequenceTypeMap.values()));
// set populations
if (variantSearchModel.getPopFreq() != null && variantSearchModel.getPopFreq().size() > 0) {
List<PopulationFrequency> populationFrequencies = new ArrayList<>();
for (String key : variantSearchModel.getPopFreq().keySet()) {
PopulationFrequency populationFrequency = new PopulationFrequency();
String[] fields = key.split("__");
populationFrequency.setStudy(fields[1]);
populationFrequency.setPopulation(fields[2]);
populationFrequency.setAltAlleleFreq(variantSearchModel.getPopFreq().get(key));
populationFrequencies.add(populationFrequency);
}
variantAnnotation.setPopulationFrequencies(populationFrequencies);
}
// Set conservations scores
scores = new ArrayList<>();
if (variantSearchModel.getPhylop() != MISSING_VALUE) {
scores.add(new Score(variantSearchModel.getPhylop(), "phylop", ""));
}
if (variantSearchModel.getPhastCons() != MISSING_VALUE) {
scores.add(new Score(variantSearchModel.getPhastCons(), "phastCons", ""));
}
if (variantSearchModel.getGerp() != MISSING_VALUE) {
scores.add(new Score(variantSearchModel.getGerp(), "gerp", ""));
}
variantAnnotation.setConservation(scores);
// Set CADD scores
scores = new ArrayList<>();
if (variantSearchModel.getCaddRaw() != MISSING_VALUE) {
scores.add(new Score(variantSearchModel.getCaddRaw(), "cadd_raw", ""));
}
if (variantSearchModel.getCaddScaled() != MISSING_VALUE) {
scores.add(new Score(variantSearchModel.getCaddScaled(), "cadd_scaled", ""));
}
variantAnnotation.setFunctionalScore(scores);
// set HPO, ClinVar and Cosmic
Map<String, List<String>> clinVarMap = new HashMap<>();
List<Cosmic> cosmicList = new ArrayList<>();
List<GeneTraitAssociation> geneTraitAssociationList = new ArrayList<>();
if (variantSearchModel.getTraits() != null) {
for (String trait : variantSearchModel.getTraits()) {
String[] fields = trait.split(" -- ");
switch (fields[0]) {
case "HP":
// Gene trait: HP -- hpo -- id -- name
GeneTraitAssociation geneTraitAssociation = new GeneTraitAssociation();
geneTraitAssociation.setHpo(fields[1]);
geneTraitAssociation.setId(fields[2]);
geneTraitAssociation.setName(fields[3]);
geneTraitAssociationList.add(geneTraitAssociation);
break;
case "CV":
// Variant trait: CV -- accession -- trait
if (!clinVarMap.containsKey(fields[1])) {
clinVarMap.put(fields[1], new ArrayList<>());
}
clinVarMap.get(fields[1]).add(fields[2]);
break;
case "CM":
// Variant trait: CM -- mutation id -- primary histology -- histology subtype
Cosmic cosmic = new Cosmic();
cosmic.setMutationId(fields[1]);
cosmic.setPrimaryHistology(fields[2]);
cosmic.setHistologySubtype(fields[3]);
cosmicList.add(cosmic);
break;
default: {
System.out.println("Unknown trait type: " + fields[0] + ", it should be HPO, ClinVar or Cosmic");
break;
}
}
}
}
VariantTraitAssociation variantTraitAssociation = new VariantTraitAssociation();
List<ClinVar> clinVarList = new ArrayList<>(clinVarMap.size());
for (String key: clinVarMap.keySet()) {
ClinVar clinVar = new ClinVar();
clinVar.setAccession(key);
clinVar.setTraits(clinVarMap.get(key));
clinVarList.add(clinVar);
}
if (clinVarList.size() > 0 || cosmicList.size() > 0) {
if (clinVarList.size() > 0) {
variantTraitAssociation.setClinvar(clinVarList);
}
if (cosmicList.size() > 0) {
variantTraitAssociation.setCosmic(cosmicList);
}
variantAnnotation.setVariantTraitAssociation(variantTraitAssociation);
}
if (geneTraitAssociationList.size() > 0) {
variantAnnotation.setGeneTraitAssociation(geneTraitAssociationList);
}
// set variant annotation
variant.setAnnotation(variantAnnotation);
return variant;
}
/**
* Conversion: from data model to storage type.
*
* @param variant Data model object
* @return Storage type object
*/
@Override
public VariantSearchModel convertToStorageType(Variant variant) {
VariantSearchModel variantSearchModel = new VariantSearchModel();
// Set general Variant attributes: id, dbSNP, chromosome, start, end, type
variantSearchModel.setId(variant.getChromosome() + ":" + variant.getStart() + ":"
+ variant.getReference() + ":" + variant.getAlternate());
variantSearchModel.setChromosome(variant.getChromosome());
variantSearchModel.setStart(variant.getStart());
variantSearchModel.setEnd(variant.getEnd());
variantSearchModel.setVariantId(variant.getId());
variantSearchModel.setType(variant.getType().toString());
// This field contains all possible IDs: id, dbSNP, genes, transcripts, protein, clinvar, hpo, ...
// This will help when searching by variant id. This is added at the end of the method after collecting all IDs
Set<String> xrefs = new HashSet<>();
xrefs.add(variant.getChromosome() + ":" + variant.getStart() + ":" + variant.getReference() + ":" + variant.getAlternate());
xrefs.add(variantSearchModel.getVariantId());
// Set Studies Alias
if (variant.getStudies() != null && variant.getStudies().size() > 0) {
List<String> studies = new ArrayList<>();
Map<String, Float> stats = new HashMap<>();
// variant.getStudies().forEach(s -> studies.add(s.getStudyId()));
for (StudyEntry studyEntry : variant.getStudies()) {
String studyId = studyEntry.getStudyId();
if (studyId.contains(":")) {
String[] split = studyId.split(":");
studyId = split[split.length - 1];
}
studies.add(studyId);
// We store the cohort stats with the format stats_STUDY_COHORT = value, e.g. stats_1kg_phase3_ALL=0.02
if (studyEntry.getStats() != null && studyEntry.getStats().size() > 0) {
Map<String, VariantStats> studyStats = studyEntry.getStats();
for (String key: studyStats.keySet()) {
stats.put("stats__" + studyId + "__" + key, studyStats.get(key).getMaf());
}
}
}
variantSearchModel.setStudies(studies);
variantSearchModel.setStats(stats);
}
// We init all annotation numeric values to MISSING_VALUE, this fixes two different scenarios:
// 1. No Variant Annotation has been found, probably because it is a SV longer than 100bp.
// 2. There are some conservation or CADD scores missing
variantSearchModel.setSift(MISSING_VALUE);
variantSearchModel.setPolyphen(MISSING_VALUE);
variantSearchModel.setPhastCons(MISSING_VALUE);
variantSearchModel.setPhylop(MISSING_VALUE);
variantSearchModel.setGerp(MISSING_VALUE);
variantSearchModel.setCaddRaw(MISSING_VALUE);
variantSearchModel.setCaddScaled(MISSING_VALUE);
// Process Variant Annotation
VariantAnnotation variantAnnotation = variant.getAnnotation();
if (variantAnnotation != null) {
// This object store all info and descriptions for full-text search
Set<String> traits = new HashSet<>();
// Set Genes and Consequence Types
List<ConsequenceType> consequenceTypes = variantAnnotation.getConsequenceTypes();
if (consequenceTypes != null) {
Map<String, Set<String>> genes = new LinkedHashMap<>();
Set<Integer> soAccessions = new LinkedHashSet<>();
Set<String> geneToSOAccessions = new LinkedHashSet<>();
Set<String> biotypes = new LinkedHashSet<>();
for (ConsequenceType consequenceType : consequenceTypes) {
// Set genes and biotypes if exist
if (StringUtils.isNotEmpty(consequenceType.getGeneName())) {
if (!genes.containsKey(consequenceType.getGeneName())) {
genes.put(consequenceType.getGeneName(), new LinkedHashSet<>());
}
genes.get(consequenceType.getGeneName()).add(consequenceType.getGeneName());
genes.get(consequenceType.getGeneName()).add(consequenceType.getEnsemblGeneId());
genes.get(consequenceType.getGeneName()).add(consequenceType.getEnsemblTranscriptId());
xrefs.add(consequenceType.getGeneName());
xrefs.add(consequenceType.getEnsemblGeneId());
xrefs.add(consequenceType.getEnsemblTranscriptId());
if (StringUtils.isNotEmpty(consequenceType.getBiotype())) {
biotypes.add(consequenceType.getBiotype());
}
}
// Remove 'SO:' prefix to Store SO Accessions as integers and also store the relation
// between genes and SO accessions
for (SequenceOntologyTerm sequenceOntologyTerm : consequenceType.getSequenceOntologyTerms()) {
int soNumber = Integer.parseInt(sequenceOntologyTerm.getAccession().substring(3));
soAccessions.add(soNumber);
if (StringUtils.isNotEmpty(consequenceType.getGeneName())) {
geneToSOAccessions.add(consequenceType.getGeneName() + "_" + soNumber);
geneToSOAccessions.add(consequenceType.getEnsemblGeneId() + "_" + soNumber);
geneToSOAccessions.add(consequenceType.getEnsemblTranscriptId() + "_" + soNumber);
}
}
// Set Uniprot accession protein id in xrefs
if (consequenceType.getProteinVariantAnnotation() != null) {
xrefs.add(consequenceType.getProteinVariantAnnotation().getUniprotAccession());
if (consequenceType.getProteinVariantAnnotation().getKeywords() != null) {
for (String keyword : consequenceType.getProteinVariantAnnotation().getKeywords()) {
traits.add("KW -- " + consequenceType.getProteinVariantAnnotation().getUniprotAccession()
+ " -- " + keyword);
}
}
if (consequenceType.getProteinVariantAnnotation().getFeatures() != null) {
for (ProteinFeature proteinFeature : consequenceType.getProteinVariantAnnotation().getFeatures()) {
traits.add("PD -- " + proteinFeature.getId() + " -- " + proteinFeature.getDescription());
}
}
}
}
// We store the accumulated data
genes.forEach((s, strings) -> variantSearchModel.getGenes().addAll(strings));
variantSearchModel.setSoAcc(new ArrayList<>(soAccessions));
variantSearchModel.setGeneToSoAcc(new ArrayList<>(geneToSOAccessions));
variantSearchModel.setBiotypes(new ArrayList<>(biotypes));
// We now process Sift and Polyphen
setProteinScores(consequenceTypes, variantSearchModel);
}
// Set Populations frequencies
if (variantAnnotation.getPopulationFrequencies() != null) {
Map<String, Float> populationFrequencies = new HashMap<>();
for (PopulationFrequency populationFrequency : variantAnnotation.getPopulationFrequencies()) {
populationFrequencies.put("popFreq__" + populationFrequency.getStudy() + "__"
+ populationFrequency.getPopulation(), populationFrequency.getAltAlleleFreq());
}
if (!populationFrequencies.isEmpty()) {
variantSearchModel.setPopFreq(populationFrequencies);
}
}
// Set Conservation scores
if (variantAnnotation.getConservation() != null) {
for (Score score : variantAnnotation.getConservation()) {
switch (score.getSource()) {
case "phastCons":
variantSearchModel.setPhastCons(score.getScore());
break;
case "phylop":
variantSearchModel.setPhylop(score.getScore());
break;
case "gerp":
variantSearchModel.setGerp(score.getScore());
break;
default:
System.out.println("Unknown 'conservation' source: score.getSource() = " + score.getSource());
break;
}
}
}
// Set CADD
if (variantAnnotation.getFunctionalScore() != null) {
for (Score score : variantAnnotation.getFunctionalScore()) {
switch (score.getSource()) {
case "cadd_raw":
case "caddRaw":
variantSearchModel.setCaddRaw(score.getScore());
break;
case "cadd_scaled":
case "caddScaled":
variantSearchModel.setCaddScaled(score.getScore());
break;
default:
System.out.println("Unknown 'functional score' source: score.getSource() = " + score.getSource());
break;
}
}
}
// Set variant traits: ClinVar, Cosmic, HPO, ...
if (variantAnnotation.getVariantTraitAssociation() != null) {
if (variantAnnotation.getVariantTraitAssociation().getClinvar() != null) {
variantAnnotation.getVariantTraitAssociation().getClinvar()
.forEach(cv -> {
xrefs.add(cv.getAccession());
cv.getTraits().forEach(cvt -> traits.add("CV" + " -- " + cv.getAccession() + " -- " + cvt));
});
}
if (variantAnnotation.getVariantTraitAssociation().getCosmic() != null) {
variantAnnotation.getVariantTraitAssociation().getCosmic()
.forEach(cosm -> {
xrefs.add(cosm.getMutationId());
traits.add("CM -- " + cosm.getMutationId() + " -- "
+ cosm.getPrimaryHistology() + " -- " + cosm.getHistologySubtype());
});
}
}
if (variantAnnotation.getGeneTraitAssociation() != null && variantAnnotation.getGeneTraitAssociation().size() > 0) {
for (GeneTraitAssociation geneTraitAssociation : variantAnnotation.getGeneTraitAssociation()) {
switch (geneTraitAssociation.getSource().toLowerCase()) {
case "hpo":
// xrefs.add(geneTraitAssociation.getHpo());
// xrefs.add(geneTraitAssociation.getId());
traits.add("HP -- " + geneTraitAssociation.getHpo() + " -- "
+ geneTraitAssociation.getId() + " -- " + geneTraitAssociation.getName());
break;
// case "disgenet":
// traits.add("DG -- " + geneTraitAssociation.getId() + " -- " + geneTraitAssociation.getName());
// break;
default:
break;
}
}
}
variantSearchModel.setTraits(new ArrayList<>(traits));
}
variantSearchModel.setXrefs(new ArrayList<>(xrefs));
return variantSearchModel;
}
public List<VariantSearchModel> convertListToStorageType(List<Variant> variants) {
List<VariantSearchModel> variantSearchModelList = new ArrayList<>(variants.size());
for (Variant variant: variants) {
VariantSearchModel variantSearchModel = convertToStorageType(variant);
if (variantSearchModel.getId() != null) {
variantSearchModelList.add(variantSearchModel);
}
}
return variantSearchModelList;
}
/**
* Retrieve the protein substitution scores and descriptions from a consequence
* type annotation: sift or polyphen, and update the variant search model.
*
* @param consequenceTypes List of consequence type target
* @param variantSearchModel Variant search model to update
*/
private void setProteinScores(List<ConsequenceType> consequenceTypes, VariantSearchModel variantSearchModel) {
double sift = 10;
String siftDesc = "";
double polyphen = MISSING_VALUE;
String polyphenDesc = "";
if (consequenceTypes != null) {
for (ConsequenceType consequenceType : consequenceTypes) {
if (consequenceType.getProteinVariantAnnotation() != null
&& consequenceType.getProteinVariantAnnotation().getSubstitutionScores() != null) {
for (Score score : consequenceType.getProteinVariantAnnotation().getSubstitutionScores()) {
String source = score.getSource();
if (source.equals("sift")) {
if (score.getScore() < sift) {
sift = score.getScore();
siftDesc = score.getDescription();
}
} else if (source.equals("polyphen")) {
if (score.getScore() > polyphen) {
polyphen = score.getScore();
polyphenDesc = score.getDescription();
}
}
}
}
}
}
// If sift not exist we set it to -100.0
if (sift == 10) {
sift = MISSING_VALUE;
}
// set scores
variantSearchModel.setSift(sift);
variantSearchModel.setPolyphen(polyphen);
// set descriptions
variantSearchModel.setSiftDesc(siftDesc);
variantSearchModel.setPolyphenDesc(polyphenDesc);
}
}