/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.hadoop.variant.converters.annotation;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hbase.client.Put;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.*;
import org.opencb.biodata.tools.variant.converters.Converter;
import org.opencb.opencga.storage.hadoop.variant.GenomeHelper;
import org.opencb.opencga.storage.hadoop.variant.converters.AbstractPhoenixConverter;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.PhoenixHelper;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.VariantPhoenixHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import static org.opencb.opencga.storage.hadoop.variant.index.phoenix.VariantPhoenixHelper.VariantColumn.*;
/**
 * Converts a {@link VariantAnnotation} into a map of Phoenix columns and values,
 * and builds the HBase {@link Put} that stores those values for the annotated variant.
 *
 * Created on 01/12/15.
 *
 * @author Jacobo Coll <jacobo167@gmail.com>
 */
public class VariantAnnotationToHBaseConverter extends AbstractPhoenixConverter
        implements Converter<VariantAnnotation, Map<PhoenixHelper.Column, ?>> {

    private final GenomeHelper genomeHelper;
    // When true, the textual form of the whole annotation is stored in FULL_ANNOTATION.
    private boolean addFullAnnotation = true;
    private final Logger logger = LoggerFactory.getLogger(VariantAnnotationToHBaseConverter.class);

    /**
     * @param genomeHelper helper providing the column family and the variant row key generator
     */
    public VariantAnnotationToHBaseConverter(GenomeHelper genomeHelper) {
        super(genomeHelper.getColumnFamily());
        this.genomeHelper = genomeHelper;
    }

    /**
     * Flattens a variant annotation into the values of the Phoenix annotation columns.
     *
     * Optional parts of the annotation (consequence types, sequence ontology terms,
     * protein annotation, trait associations, xrefs, scores, frequencies) are skipped
     * when they are {@code null}.
     *
     * @param variantAnnotation annotation to convert
     * @return map from Phoenix column to the value to be stored in that column
     * @throws NumberFormatException if a sequence ontology accession does not end with
     *                               a number after its first three characters
     */
    @Override
    public Map<PhoenixHelper.Column, ?> convert(VariantAnnotation variantAnnotation) {
        Map<PhoenixHelper.Column, Object> map = new HashMap<>();

        if (addFullAnnotation) {
            map.put(FULL_ANNOTATION, variantAnnotation.toString());
        }

        Set<String> genes = new HashSet<>();
        Set<String> transcripts = new HashSet<>();
        Set<String> flags = new HashSet<>();
        Set<Integer> so = new HashSet<>();
        Set<String> biotype = new HashSet<>();
        Set<Double> polyphen = new HashSet<>();
        Set<Double> sift = new HashSet<>();
        Set<String> polyphenDesc = new HashSet<>();
        Set<String> siftDesc = new HashSet<>();
        Set<String> geneTraitName = new HashSet<>();
        Set<String> geneTraitId = new HashSet<>();
        Set<String> hpo = new HashSet<>();
        Set<String> drugs = new HashSet<>();
        Set<String> proteinKeywords = new HashSet<>();

        // Contains all the xrefs, and the id, the geneNames and transcripts
        Set<String> xrefs = new HashSet<>();
        addNotNull(xrefs, variantAnnotation.getId());

        // Consequence types are optional, like the rest of the annotation fields.
        if (variantAnnotation.getConsequenceTypes() != null) {
            for (ConsequenceType consequenceType : variantAnnotation.getConsequenceTypes()) {
                addNotNull(genes, consequenceType.getGeneName());
                addNotNull(genes, consequenceType.getEnsemblGeneId());
                addNotNull(transcripts, consequenceType.getEnsemblTranscriptId());
                addNotNull(biotype, consequenceType.getBiotype());
                addAllNotNull(flags, consequenceType.getTranscriptAnnotationFlags());

                if (consequenceType.getSequenceOntologyTerms() != null) {
                    for (SequenceOntologyTerm sequenceOntologyTerm : consequenceType.getSequenceOntologyTerms()) {
                        String accession = sequenceOntologyTerm.getAccession();
                        if (accession != null) {
                            // Store only the numeric part that follows the three-character prefix.
                            addNotNull(so, Integer.parseInt(accession.substring(3)));
                        }
                    }
                }

                if (consequenceType.getProteinVariantAnnotation() != null) {
                    if (consequenceType.getProteinVariantAnnotation().getSubstitutionScores() != null) {
                        for (Score score : consequenceType.getProteinVariantAnnotation().getSubstitutionScores()) {
                            // Constant-first comparison: a score without source is ignored instead of failing.
                            if ("sift".equalsIgnoreCase(score.getSource())) {
                                addNotNull(sift, score.getScore());
                                addNotNull(siftDesc, score.getDescription());
                            } else if ("polyphen".equalsIgnoreCase(score.getSource())) {
                                addNotNull(polyphen, score.getScore());
                                addNotNull(polyphenDesc, score.getDescription());
                            }
                        }
                    }
                    if (consequenceType.getProteinVariantAnnotation().getKeywords() != null) {
                        proteinKeywords.addAll(consequenceType.getProteinVariantAnnotation().getKeywords());
                    }
                    addNotNull(xrefs, consequenceType.getProteinVariantAnnotation().getUniprotName());
                    addNotNull(xrefs, consequenceType.getProteinVariantAnnotation().getUniprotAccession());
                    addNotNull(xrefs, consequenceType.getProteinVariantAnnotation().getUniprotVariantId());
                }
            }
        }

        if (variantAnnotation.getVariantTraitAssociation() != null) {
            if (variantAnnotation.getVariantTraitAssociation().getCosmic() != null) {
                for (Cosmic cosmic : variantAnnotation.getVariantTraitAssociation().getCosmic()) {
                    addNotNull(xrefs, cosmic.getMutationId());
                }
            }
            if (variantAnnotation.getVariantTraitAssociation().getClinvar() != null) {
                for (ClinVar clinVar : variantAnnotation.getVariantTraitAssociation().getClinvar()) {
                    addNotNull(xrefs, clinVar.getAccession());
                }
            }
        }

        xrefs.addAll(genes);
        xrefs.addAll(transcripts);
        if (variantAnnotation.getXrefs() != null) {
            for (Xref xref : variantAnnotation.getXrefs()) {
                addNotNull(xrefs, xref.getId());
            }
        }

        if (variantAnnotation.getGeneTraitAssociation() != null) {
            for (GeneTraitAssociation geneTrait : variantAnnotation.getGeneTraitAssociation()) {
                addNotNull(geneTraitName, geneTrait.getName());
                addNotNull(geneTraitId, geneTrait.getId());
                addNotNull(hpo, geneTrait.getHpo());
            }
        }
        xrefs.addAll(hpo);

        if (variantAnnotation.getGeneDrugInteraction() != null) {
            for (GeneDrugInteraction drug : variantAnnotation.getGeneDrugInteraction()) {
                addNotNull(drugs, drug.getDrugName());
            }
        }

        map.put(CHROMOSOME, variantAnnotation.getChromosome());
        map.put(POSITION, variantAnnotation.getStart());
        map.put(REFERENCE, variantAnnotation.getReference());
        map.put(ALTERNATE, variantAnnotation.getAlternate());
        map.put(GENES, genes);
        map.put(TRANSCRIPTS, transcripts);
        map.put(BIOTYPE, biotype);
        map.put(SO, so);
        map.put(POLYPHEN, sortProteinSubstitutionScores(polyphen));
        map.put(POLYPHEN_DESC, polyphenDesc);
        map.put(SIFT, sortProteinSubstitutionScores(sift));
        map.put(SIFT_DESC, siftDesc);
        map.put(TRANSCRIPTION_FLAGS, flags);
        map.put(GENE_TRAITS_ID, geneTraitId);
        map.put(PROTEIN_KEYWORDS, proteinKeywords);
        map.put(GENE_TRAITS_NAME, geneTraitName);
        map.put(HPO, hpo);
        map.put(DRUG, drugs);
        map.put(XREFS, xrefs);

        if (variantAnnotation.getConservation() != null) {
            for (Score score : variantAnnotation.getConservation()) {
                PhoenixHelper.Column column = VariantPhoenixHelper.getConservationScoreColumn(score.getSource());
                map.put(column, score.getScore());
            }
        }

        if (variantAnnotation.getPopulationFrequencies() != null) {
            for (PopulationFrequency pf : variantAnnotation.getPopulationFrequencies()) {
                PhoenixHelper.Column column = VariantPhoenixHelper.getPopulationFrequencyColumn(pf.getStudy(), pf.getPopulation());
                // Stored as a two-element list: reference allele frequency, then alternate allele frequency.
                map.put(column, Arrays.asList(pf.getRefAlleleFreq(), pf.getAltAlleleFreq()));
            }
        }

        if (variantAnnotation.getFunctionalScore() != null) {
            for (Score score : variantAnnotation.getFunctionalScore()) {
                PhoenixHelper.Column column = VariantPhoenixHelper.getFunctionalScoreColumn(score.getSource());
                map.put(column, score.getScore());
            }
        }

        VariantType variantType = Variant.inferType(variantAnnotation.getReference(),
                variantAnnotation.getAlternate(),
                variantAnnotation.getReference().length());
        // A variant that carries an id is reported as SNP/MNP instead of SNV/MNV.
        if (StringUtils.isNotBlank(variantAnnotation.getId())) {
            if (variantType.equals(VariantType.SNV)) {
                variantType = VariantType.SNP;
            } else if (variantType.equals(VariantType.MNV)) {
                variantType = VariantType.MNP;
            }
        }
        map.put(TYPE, variantType.toString());

        return map;
    }

    /**
     * Builds the HBase {@link Put} for an annotated variant.
     *
     * @param variantAnnotation annotation whose chromosome, start, reference and alternate
     *                          define the row key
     * @param map               column values, as produced by {@link #convert(VariantAnnotation)}
     * @return a Put on the variant row containing every entry of {@code map}
     */
    Put buildPut(VariantAnnotation variantAnnotation, Map<PhoenixHelper.Column, ?> map) {
        byte[] bytesRowKey = genomeHelper.generateVariantRowKey(variantAnnotation.getChromosome(), variantAnnotation.getStart(),
                variantAnnotation.getReference(), variantAnnotation.getAlternate());

        Put put = new Put(bytesRowKey);
        map.forEach((column, value) -> add(put, column, value));
        return put;
    }

    /**
     * Arranges protein substitution scores so that the minimum comes first and the
     * maximum second, followed by the remaining scores in the set's iteration order.
     * A single score therefore appears twice (as both minimum and maximum).
     * NOTE(review): presumably the fixed positions let queries address the extremes
     * by index - confirm against the Phoenix query builder.
     *
     * The given set is not modified.
     *
     * @param scores scores to arrange; must not contain {@code null}
     * @return a new list; empty if {@code scores} is empty or its minimum is negative
     */
    private List<Double> sortProteinSubstitutionScores(Set<Double> scores) {
        // One extra slot: with a single score the list holds it twice.
        List<Double> sorted = new ArrayList<>(scores.size() + 1);
        if (scores.isEmpty()) {
            return sorted;
        }
        Double min = Collections.min(scores);
        Double max = Collections.max(scores);
        if (min >= 0) {
            sorted.add(min);
            sorted.add(max);
            for (Double score : scores) {
                if (!score.equals(min) && !score.equals(max)) {
                    sorted.add(score);
                }
            }
        }
        return sorted;
    }
}