/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.mongodb.variant.converters;
import org.bson.Document;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.VariantAnnotation;
import org.opencb.biodata.models.variant.avro.VariantType;
import org.opencb.commons.datastore.core.ComplexTypeConverter;
import org.opencb.opencga.storage.core.variant.adaptors.VariantField;
import org.opencb.opencga.storage.mongodb.variant.adaptors.VariantMongoDBWriter;
import java.util.*;
import static java.util.Collections.emptyList;
import static java.util.Collections.singletonList;
import static java.util.Collections.unmodifiableMap;
import static org.opencb.opencga.storage.mongodb.variant.converters.DocumentToStudyVariantEntryConverter.*;
import static org.opencb.opencga.storage.mongodb.variant.converters.DocumentToVariantAnnotationConverter.*;
/**
 * Converts between the {@link Variant} data model and the {@link Document} representation
 * used by the MongoDB variant storage.
 * <p>
 * Studies and statistics are only converted when the corresponding converter was supplied
 * at construction time.
 *
 * @author Cristina Yenyxe Gonzalez Garcia &lt;cyenyxe@ebi.ac.uk&gt;
 */
public class DocumentToVariantConverter implements ComplexTypeConverter<Variant, Document> {

    // Top-level keys of the variant document.
    public static final String CHROMOSOME_FIELD = "chromosome";
    public static final String START_FIELD = "start";
    public static final String END_FIELD = "end";
    public static final String LENGTH_FIELD = "length";
    public static final String REFERENCE_FIELD = "reference";
    public static final String ALTERNATE_FIELD = "alternate";
    public static final String IDS_FIELD = "ids";
    public static final String TYPE_FIELD = "type";

    // HGVS is stored as a list of {type, name} sub-documents under "hgvs".
    public static final String HGVS_FIELD = "hgvs";
    public static final String HGVS_NAME_FIELD = "name";
    public static final String HGVS_TYPE_FIELD = "type";

    public static final String STUDIES_FIELD = "studies";
    public static final String ANNOTATION_FIELD = "annotation";
    public static final String CUSTOM_ANNOTATION_FIELD = "customAnnotation";
    public static final String STATS_FIELD = "stats";

    // Sub-document with internal fields used for query optimization, and its chunk id list.
    public static final String AT_FIELD = "_at";
    public static final String CHUNK_IDS_FIELD = "chunkIds";

    /** Document keys (dotted paths for nested keys) associated with each {@link VariantField}. */
    protected static final Map<VariantField, List<String>> FIELDS_MAP;

    /** Unmodifiable set with the coordinate, allele and type fields. */
    public static final Set<VariantField> REQUIRED_FIELDS_SET;

    static {
        REQUIRED_FIELDS_SET = Collections.unmodifiableSet(EnumSet.of(
                VariantField.CHROMOSOME,
                VariantField.START,
                VariantField.END,
                VariantField.REFERENCE,
                VariantField.ALTERNATE,
                VariantField.TYPE));

        Map<VariantField, List<String>> map = new EnumMap<>(VariantField.class);

        // Core variant fields
        map.put(VariantField.ID, singletonList(IDS_FIELD));
        map.put(VariantField.CHROMOSOME, singletonList(CHROMOSOME_FIELD));
        map.put(VariantField.START, singletonList(START_FIELD));
        map.put(VariantField.END, singletonList(END_FIELD));
        map.put(VariantField.REFERENCE, singletonList(REFERENCE_FIELD));
        map.put(VariantField.ALTERNATE, singletonList(ALTERNATE_FIELD));
        map.put(VariantField.LENGTH, singletonList(LENGTH_FIELD));
        map.put(VariantField.TYPE, singletonList(TYPE_FIELD));
        map.put(VariantField.HGVS, singletonList(HGVS_FIELD));

        // Studies. Stats live in their own top-level key, hence STUDIES covers both.
        map.put(VariantField.STUDIES, Arrays.asList(STUDIES_FIELD, STATS_FIELD));
        map.put(VariantField.STUDIES_SAMPLES_DATA, Arrays.asList(
                STUDIES_FIELD + '.' + GENOTYPES_FIELD,
                STUDIES_FIELD + '.' + FILES_FIELD + '.' + FILEID_FIELD,
                STUDIES_FIELD + '.' + FILES_FIELD + '.' + SAMPLE_DATA_FIELD));
        map.put(VariantField.STUDIES_FILES, Arrays.asList(
                STUDIES_FIELD + '.' + FILES_FIELD + '.' + FILEID_FIELD,
                STUDIES_FIELD + '.' + FILES_FIELD + '.' + ATTRIBUTES_FIELD,
                STUDIES_FIELD + '.' + FILES_FIELD + '.' + ORI_FIELD));
        map.put(VariantField.STUDIES_STATS, singletonList(STATS_FIELD));
        map.put(VariantField.STUDIES_SECONDARY_ALTERNATES, singletonList(STUDIES_FIELD + '.' + ALTERNATES_FIELD));
        map.put(VariantField.STUDIES_STUDY_ID, singletonList(STUDIES_FIELD + '.' + STUDYID_FIELD));

        // Annotation. Fields mapped to an empty list have no document key associated here.
        map.put(VariantField.ANNOTATION, singletonList(ANNOTATION_FIELD));
        map.put(VariantField.ANNOTATION_ANCESTRAL_ALLELE, emptyList());
        map.put(VariantField.ANNOTATION_ID, emptyList());
        map.put(VariantField.ANNOTATION_XREFS, singletonList(ANNOTATION_FIELD + '.' + XREFS_FIELD));
        map.put(VariantField.ANNOTATION_HGVS, emptyList());
        map.put(VariantField.ANNOTATION_DISPLAY_CONSEQUENCE_TYPE, emptyList());
        map.put(VariantField.ANNOTATION_CONSEQUENCE_TYPES, singletonList(ANNOTATION_FIELD + '.' + CONSEQUENCE_TYPE_FIELD));
        map.put(VariantField.ANNOTATION_POPULATION_FREQUENCIES,
                singletonList(ANNOTATION_FIELD + '.' + POPULATION_FREQUENCIES_FIELD));
        map.put(VariantField.ANNOTATION_MINOR_ALLELE, emptyList());
        map.put(VariantField.ANNOTATION_MINOR_ALLELE_FREQ, emptyList());
        map.put(VariantField.ANNOTATION_CONSERVATION, Arrays.asList(
                ANNOTATION_FIELD + '.' + CONSERVED_REGION_PHYLOP_FIELD,
                ANNOTATION_FIELD + '.' + CONSERVED_REGION_PHASTCONS_FIELD,
                ANNOTATION_FIELD + '.' + CONSERVED_REGION_GERP_FIELD));
        map.put(VariantField.ANNOTATION_GENE_EXPRESSION, emptyList());
        map.put(VariantField.ANNOTATION_GENE_TRAIT_ASSOCIATION, singletonList(ANNOTATION_FIELD + '.' + GENE_TRAIT_FIELD));
        map.put(VariantField.ANNOTATION_GENE_DRUG_INTERACTION, singletonList(ANNOTATION_FIELD + '.' + DRUG_FIELD));
        map.put(VariantField.ANNOTATION_VARIANT_TRAIT_ASSOCIATION, singletonList(ANNOTATION_FIELD + '.' + CLINICAL_DATA_FIELD));
        map.put(VariantField.ANNOTATION_FUNCTIONAL_SCORE, Arrays.asList(
                ANNOTATION_FIELD + '.' + FUNCTIONAL_CADD_RAW_FIELD,
                ANNOTATION_FIELD + '.' + FUNCTIONAL_CADD_SCALED_FIELD));
        map.put(VariantField.ANNOTATION_ADDITIONAL_ATTRIBUTES, singletonList(CUSTOM_ANNOTATION_FIELD));

        FIELDS_MAP = unmodifiableMap(map);
    }

    private final DocumentToStudyVariantEntryConverter variantStudyEntryConverter;
    // Study ids to keep when reading; null means "keep every study".
    private final Set<Integer> returnStudies;
    private final DocumentToVariantAnnotationConverter variantAnnotationConverter;
    private final DocumentToVariantStatsConverter statsConverter;
    private final VariantStringIdConverter idConverter = new VariantStringIdConverter();
    // When the stored id list is empty, use Variant#toString() as id
    // (CHR:POS:REF:ALT according to the original comment).
    private final boolean addDefaultId;

    /**
     * Create a converter between {@link Variant} and {@link Document} entities when there is
     * no need to convert the studies the variant was read from.
     */
    public DocumentToVariantConverter() {
        this(null, null);
    }

    /**
     * Create a converter between {@link Variant} and {@link Document} entities. A converter for
     * the studies the variant was read from can be provided in case those
     * should be processed during the conversion.
     *
     * @param variantStudyEntryConverter The object used to convert the studies, or null to skip them
     * @param statsConverter             Stats converter, or null to skip the statistics
     */
    public DocumentToVariantConverter(DocumentToStudyVariantEntryConverter variantStudyEntryConverter,
                                      DocumentToVariantStatsConverter statsConverter) {
        this(variantStudyEntryConverter, statsConverter, null);
    }

    /**
     * Create a converter between {@link Variant} and {@link Document} entities. A converter for
     * the studies the variant was read from can be provided in case those
     * should be processed during the conversion.
     *
     * @param variantStudyEntryConverter The object used to convert the studies, or null to skip them
     * @param statsConverter             Stats converter, or null to skip the statistics
     * @param returnStudies              Ids of the studies to return when reading a document,
     *                                   or null to return all of them
     */
    public DocumentToVariantConverter(DocumentToStudyVariantEntryConverter variantStudyEntryConverter,
                                      DocumentToVariantStatsConverter statsConverter, Collection<Integer> returnStudies) {
        this.variantStudyEntryConverter = variantStudyEntryConverter;
        this.variantAnnotationConverter = new DocumentToVariantAnnotationConverter();
        this.statsConverter = statsConverter;
        this.addDefaultId = true;
        if (returnStudies == null) {
            this.returnStudies = null;
        } else if (returnStudies instanceof Set) {
            // Already a set: used as given, without copying.
            this.returnStudies = (Set<Integer>) returnStudies;
        } else {
            this.returnStudies = new HashSet<>(returnStudies);
        }
    }

    /**
     * Builds a {@link Variant} from its stored document.
     * <p>
     * Chromosome, start, end, reference and alternate are always read. Ids, type, HGVS, studies,
     * annotation and statistics are only set when present in the document (and, for studies and
     * statistics, when the matching converter is configured).
     *
     * @param object Variant document
     * @return The converted variant
     * @throws NullPointerException if the document has no start or end
     */
    @Override
    public Variant convertToDataModelType(Document object) {
        String chromosome = (String) object.get(CHROMOSOME_FIELD);
        int start = getRequiredInt(object, START_FIELD);
        int end = getRequiredInt(object, END_FIELD);
        String reference = (String) object.get(REFERENCE_FIELD);
        String alternate = (String) object.get(ALTERNATE_FIELD);
        Variant variant = new Variant(chromosome, start, end, reference, alternate);

        readIds(object, variant);
        readType(object, variant);
        readHgvs(object, variant);
        readStudies(object, variant);
        readAnnotation(object, variant);
        readStats(object, variant);

        return variant;
    }

    /**
     * Reads a mandatory integer value, accepting any numeric type the document may hold
     * (not only {@link Integer}).
     */
    private static int getRequiredInt(Document document, String field) {
        Object value = document.get(field);
        if (value == null) {
            throw new NullPointerException("Missing required field '" + field + "' in variant document");
        }
        return ((Number) value).intValue();
    }

    /** Sets id and names from the stored id list: first element is the id, the rest are the names. */
    private void readIds(Document object, Variant variant) {
        if (!object.containsKey(IDS_FIELD)) {
            return;
        }
        @SuppressWarnings("unchecked")
        Collection<String> storedIds = (Collection<String>) object.get(IDS_FIELD, Collection.class);
        // A key holding null is handled like an empty list instead of failing.
        if (storedIds == null || storedIds.isEmpty()) {
            if (addDefaultId) {
                variant.setId(variant.toString());
            }
            variant.setNames(emptyList());
            return;
        }
        Iterator<String> iterator = storedIds.iterator();
        variant.setId(iterator.next());
        // Independent copy, so the variant does not hold a view over a temporary list.
        List<String> names = new ArrayList<>(storedIds.size() - 1);
        while (iterator.hasNext()) {
            names.add(iterator.next());
        }
        variant.setNames(names);
    }

    /** Sets the variant type when the document holds a non-null type. */
    private void readType(Document object, Variant variant) {
        Object type = object.get(TYPE_FIELD);
        if (type != null) {
            variant.setType(VariantType.valueOf(type.toString()));
        }
    }

    /** Transforms HGVS: list of {type, name} entries -> entries added one by one to the variant. */
    private void readHgvs(Document object, Variant variant) {
        List<?> mongoHgvs = (List<?>) object.get(HGVS_FIELD);
        if (mongoHgvs == null) {
            return;
        }
        for (Object element : mongoHgvs) {
            Document hgvs = (Document) element;
            variant.addHgvs((String) hgvs.get(HGVS_TYPE_FIELD), (String) hgvs.get(HGVS_NAME_FIELD));
        }
    }

    /** Converts the stored studies, keeping only those in {@code returnStudies} when it is set. */
    private void readStudies(Document object, Variant variant) {
        if (variantStudyEntryConverter == null) {
            return;
        }
        List<?> mongoStudies = object.get(STUDIES_FIELD, List.class);
        if (mongoStudies == null) {
            return;
        }
        for (Object element : mongoStudies) {
            Document study = (Document) element;
            if (returnStudies == null || returnStudies.contains(((Number) study.get(STUDYID_FIELD)).intValue())) {
                variant.addStudyEntry(variantStudyEntryConverter.convertToDataModelType(study));
            }
        }
    }

    /**
     * Converts the annotation and/or the custom annotation, if any of them is present.
     * The annotation may be stored either as a document or as a list, in which case only its
     * first element is used.
     */
    private void readAnnotation(Document object, Variant variant) {
        Object stored = object.get(ANNOTATION_FIELD);
        Document mongoAnnotation;
        if (stored instanceof List) {
            List<?> annotations = (List<?>) stored;
            mongoAnnotation = annotations.isEmpty() ? null : (Document) annotations.get(0);
        } else {
            mongoAnnotation = (Document) stored;
        }
        Document customAnnotation = object.get(CUSTOM_ANNOTATION_FIELD, Document.class);
        if (mongoAnnotation == null && customAnnotation == null) {
            return;
        }

        VariantAnnotation annotation;
        if (mongoAnnotation != null) {
            annotation = variantAnnotationConverter.convertToDataModelType(mongoAnnotation, customAnnotation);
        } else {
            // Only custom annotation available: build an annotation holding just the additional attributes.
            annotation = new VariantAnnotation();
            annotation.setAdditionalAttributes(
                    variantAnnotationConverter.convertAdditionalAttributesToDataModelType(customAnnotation));
        }
        // Coordinates and alleles are copied from the variant.
        annotation.setChromosome(variant.getChromosome());
        annotation.setAlternate(variant.getAlternate());
        annotation.setReference(variant.getReference());
        annotation.setStart(variant.getStart());
        variant.setAnnotation(annotation);
    }

    /** Delegates the stored cohort statistics to the stats converter, when configured and present. */
    private void readStats(Document object, Variant variant) {
        if (statsConverter == null || !object.containsKey(STATS_FIELD)) {
            return;
        }
        @SuppressWarnings("unchecked")
        List<Document> stats = object.get(STATS_FIELD, List.class);
        statsConverter.convertCohortsToDataModelType(stats, variant);
    }

    /**
     * Builds the document to store for a {@link Variant}.
     * <p>
     * The variant ids are deliberately not written. Studies and statistics are only written when
     * the matching converter is configured. The annotation key is always written: as a
     * single-element list when the variant has consequence types, as an empty list otherwise.
     *
     * @param variant Variant to convert
     * @return The document representing the variant
     */
    @Override
    public Document convertToStorageType(Variant variant) {
        // Attributes easily calculated. IDs are not included.
        Document mongoVariant = new Document("_id", buildStorageId(variant))
                .append(CHROMOSOME_FIELD, variant.getChromosome())
                .append(START_FIELD, variant.getStart())
                .append(END_FIELD, variant.getEnd())
                .append(LENGTH_FIELD, variant.getLength())
                .append(REFERENCE_FIELD, variant.getReference())
                .append(ALTERNATE_FIELD, variant.getAlternate())
                .append(TYPE_FIELD, variant.getType().name());

        // Internal fields used for query optimization (dictionary named "_at").
        // Two chunk ids are stored, one per resolution level (1k and 10k according to the original
        // comment; the actual sizes are the VariantMongoDBWriter constants).
        List<String> chunkIds = new ArrayList<>(2);
        chunkIds.add(buildChunkId(variant.getChromosome(), variant.getStart(), VariantMongoDBWriter.CHUNK_SIZE_SMALL));
        chunkIds.add(buildChunkId(variant.getChromosome(), variant.getStart(), VariantMongoDBWriter.CHUNK_SIZE_BIG));
        mongoVariant.append(AT_FIELD, new Document(CHUNK_IDS_FIELD, chunkIds));

        // Transform HGVS: Map of lists -> List of map entries
        List<Document> hgvs = new ArrayList<>();
        for (Map.Entry<String, List<String>> entry : variant.getHgvs().entrySet()) {
            for (String name : entry.getValue()) {
                hgvs.add(new Document(HGVS_TYPE_FIELD, entry.getKey()).append(HGVS_NAME_FIELD, name));
            }
        }
        mongoVariant.append(HGVS_FIELD, hgvs);

        // Studies
        if (variantStudyEntryConverter != null) {
            List<Document> mongoStudies = new ArrayList<>();
            for (StudyEntry studyEntry : variant.getStudies()) {
                mongoStudies.add(variantStudyEntryConverter.convertToStorageType(variant, studyEntry));
            }
            mongoVariant.append(STUDIES_FIELD, mongoStudies);
        }

        // Annotation: written once, either populated or as an empty list.
        VariantAnnotation variantAnnotation = variant.getAnnotation();
        if (variantAnnotation != null
                && variantAnnotation.getConsequenceTypes() != null
                && !variantAnnotation.getConsequenceTypes().isEmpty()) {
            mongoVariant.append(ANNOTATION_FIELD,
                    singletonList(variantAnnotationConverter.convertToStorageType(variantAnnotation)));
        } else {
            mongoVariant.append(ANNOTATION_FIELD, emptyList());
        }

        // Statistics
        if (statsConverter != null) {
            mongoVariant.put(STATS_FIELD, statsConverter.convertCohortsToStorageType(variant.getStudiesMap()));
        }
        return mongoVariant;
    }

    /** Builds a chunk id with the shape {@code <chromosome>_<start / chunkSize>_<chunkSize / 1000>k}. */
    private static String buildChunkId(String chromosome, int start, long chunkSize) {
        return chromosome + "_" + start / chunkSize + "_" + chunkSize / 1000 + "k";
    }

    /**
     * Builds the storage identifier ("_id") of a variant, delegating to the id converter.
     *
     * @param v Variant
     * @return Identifier of the variant document
     */
    public String buildStorageId(Variant v) {
        return idConverter.buildId(v);
    }

    /**
     * Builds the storage identifier ("_id") from the variant coordinates and alleles,
     * delegating to the id converter.
     *
     * @param chromosome Chromosome
     * @param start      Start position
     * @param reference  Reference allele
     * @param alternate  Alternate allele
     * @return Identifier of the variant document
     */
    public String buildStorageId(String chromosome, int start, String reference, String alternate) {
        return idConverter.buildId(chromosome, start, reference, alternate);
    }

    /**
     * Gets the document keys associated with a {@link VariantField}.
     *
     * @param field Variant field
     * @return Document keys for the field (possibly empty), or null if the field is not mapped
     */
    public static List<String> toShortFieldName(VariantField field) {
        return FIELDS_MAP.get(field);
    }
}