/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.mongodb.variant.load.variants;
import com.mongodb.MongoExecutionTimeoutException;
import org.apache.commons.lang3.StringUtils;
import org.bson.BsonArray;
import org.bson.Document;
import org.bson.conversions.Bson;
import org.bson.types.Binary;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantNormalizer;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.avro.VariantType;
import org.opencb.biodata.tools.variant.merge.VariantMerger;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryResult;
import org.opencb.commons.run.ParallelTaskRunner;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor;
import org.opencb.opencga.storage.mongodb.variant.converters.DocumentToSamplesConverter;
import org.opencb.opencga.storage.mongodb.variant.converters.DocumentToStudyVariantEntryConverter;
import org.opencb.opencga.storage.mongodb.variant.converters.DocumentToVariantConverter;
import org.opencb.opencga.storage.mongodb.variant.converters.VariantStringIdConverter;
import org.opencb.opencga.storage.mongodb.variant.load.stage.MongoDBVariantStageLoader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.stream.Collectors;
import static com.mongodb.client.model.Filters.and;
import static com.mongodb.client.model.Filters.eq;
import static com.mongodb.client.model.Updates.*;
import static org.opencb.opencga.storage.mongodb.variant.MongoDBVariantStorageEngine.MongoDBVariantOptions.DEFAULT_GENOTYPE;
import static org.opencb.opencga.storage.mongodb.variant.converters.DocumentToSamplesConverter.UNKNOWN_GENOTYPE;
import static org.opencb.opencga.storage.mongodb.variant.converters.DocumentToStudyVariantEntryConverter.*;
import static org.opencb.opencga.storage.mongodb.variant.converters.DocumentToVariantConverter.IDS_FIELD;
import static org.opencb.opencga.storage.mongodb.variant.converters.DocumentToVariantConverter.STUDIES_FIELD;
import static org.opencb.opencga.storage.mongodb.variant.load.stage.MongoDBVariantStageLoader.STRING_ID_CONVERTER;
import static org.opencb.opencga.storage.mongodb.variant.load.stage.MongoDBVariantStageLoader.VARIANT_CONVERTER_DEFAULT;
/**
* Created on 07/04/16.
*
* Merges data from the Stage collection into the Variant collection.
*
* There are different situations depending on how the data arrives and on the current
* configuration of the variants.
*
* ### Depending on the configuration of the variants
*
* Easy scenarios:
*
*               +---------------+------+
*               | Stage         | Vars |
*               +-------+-------+------+
*               | File1 | File2 | Vars |
* +-------------+-------+-------+------+
* | 1:100:A:C   | DATA  | DATA  | DATA | <--- A1) Update variants with data from File1 and File2
* +-------------+-------+-------+------+
* | 1:125:A:C   | DATA  | DATA  | ---- | <--- A2) New variant
* +-------------+-------+-------+------+
* | 1:150:G:T   | ----  | ----  | DATA | <--- A3) Fill Gaps. If new study, skip.
* +-------------+-------+-------+------+
*
* Partial information in the stage collection. Fill gaps.
* +-------------+-------+-------+------+
* | 1:200:A:C   | ----  | DATA  | DATA | <--- B1) Update variants with data from File2 and missing from File1
* +-------------+-------+-------+------+
* | 1:225:A:C   | DATA  | ----  | ---- | <--- B2) New variant with missing information from File2
* +-------------+-------+-------+------+
* | 1:250:G:T   | ----  | ----  | DATA | <--- B3) Missing information. Fill Gaps. If new study, skip.
* +-------------+-------+-------+------+
*
* Overlapping variants
* +-------------+-------+-------+------+
* | 1:300:A:C   | ----  | DATA  | DATA | <---
* +-------------+-------+-------+------+      |- C1) Simple merge variants
* | 1:300:A:T   | DATA  | ----  | DATA | <---
* +=============+=======+=======+======+
* | 1:400:A:C   | DATA  | DATA  | DATA | <---
* +-------------+-------+-------+------+      |- C2) Simple merge variants. File1 has duplicated information for these variants.
* | 1:400:A:T   | DATA  | ----  | DATA | <---      Ideally, the File1 variants carry the correct secondary alternate. Discard one variant.
* +=============+=======+=======+======+
* | 1:500:A:C   | ----  | ----  | DATA | <---
* +-------------+-------+-------+------+      |- C3) New overlapped region. Fetch data from the database and merge.
* | 1:500:A:T   | DATA  | DATA  | ---- | <---      Fill gaps for indexed files.
* +=============+=======+=======+======+
* | 1:600:A:C   | ----  | ----  | DATA | <---
* +-------------+-------+-------+------+      |- C4) New overlapped region. Fetch data from the database and merge.
* | 1:600:A:T   | DATA  | ----  | ---- | <---      Fill gaps for indexed files and File2.
* +=============+=======+=======+======+
* | 1:700:A:C   | ----  | ----  | DATA | <---
* +-------------+-------+-------+------+      |- C5) Already existing overlapped region. No extra information. No overlapping variants.
* | 1:700:A:T   | ----  | ----  | DATA | <---      Fill gaps for File1 and File2.
* +=============+=======+=======+======+
*
* Duplicated variants. Do not load duplicated variants.
* +-------------+-------+-------+------+
* | 1:200:A:C   | ----  | DATA* | DATA | <--- D1) Duplicated variants in file 2. Fill gaps for file1 and file2.
* +-------------+-------+-------+------+           Skip if new study (nonInserted++)
* | 1:250:G:T   | ----  | DATA* | ---- | <--- D2) Duplicated variants in file 2, missing for the rest.
* +-------------+-------+-------+------+           Skip variant! (nonInserted++)
* | 1:225:A:C   | DATA  | DATA* | DATA | <--- D3) Duplicated variants in file 2. Fill gaps for file2 and insert file1.
* +-------------+-------+-------+------+
* | 1:225:A:C   | DATA  | DATA* | ---- | <--- D4) Duplicated variants in file 2. Fill gaps for file2 and insert file1.
* +-------------+-------+-------+------+
*
*
* ### Depending on how the data is split into files, and how the files are sent.
*
* The data can come split by chromosome and/or by batches of samples. In the
* following tables, columns are batches of samples and rows are different regions,
* for example, chromosomes.
* Each cell represents a file: an X if the file is already merged into the database,
* or an O if it is being loaded right now.
*
* In each scenario, care must be taken when filling gaps (for new or missing variants):
* the "indexed files" have to be selected properly for each region.
*
* X = File loaded and merged
* O = To be merged
*
* a) Merging one file, with other files from the same sample set and chromosome already merged.
*        +---+---+---+
*        |S1 |S2 |S3 |
* +------+---+---+---+
* | Chr1 | X | X |   |
* +------+---+---+---+
* | Chr2 | X | O |   |
* +------+---+---+---+
* | Chr3 | X |   |   |
* +------+---+---+---+
* Indexed files = {f_21}
*
* a.2) Merging different files from the same sample set.
* Care must be taken when filling gaps (for new or missing variants).
* In this case, merge chromosome by chromosome in different calls.
*        +---+---+---+
*        |S1 |S2 |S3 |
* +------+---+---+---+
* | Chr1 | X | X | X |
* +------+---+---+---+
* | Chr2 | X | X | O |
* +------+---+---+---+
* | Chr3 | X | X | O |
* +------+---+---+---+
* Indexed files in Chr2 = {f_21, f_22}
* Indexed files in Chr3 = {f_31, f_32}
*
* b) Merging one or more files from the same chromosome.
* Given the indexed files in this region, no special consideration is needed.
*        +---+---+---+---+
*        |S1 |S2 |S3 |S4 |
* +------+---+---+---+---+
* | Chr1 | X | X | X | X |
* +------+---+---+---+---+
* | Chr2 | X | X | O | O |
* +------+---+---+---+---+
* | Chr3 | X |   |   |   |
* +------+---+---+---+---+
* Indexed files = {f_21, f_22}
*
* c) Mixing different regions and sample sets.
* Same situation as case a.2.
*        +---+---+---+
*        |S1 |S2 |S3 |
* +------+---+---+---+
* | Chr1 | X | X | O |
* +------+---+---+---+
* | Chr2 | X | X |   |
* +------+---+---+---+
* | Chr3 | X | O |   |
* +------+---+---+---+
* Indexed files in Chr1 = {f_11, f_12}
* Indexed files in Chr3 = {f_31}
*
* d) Mixing different region sizes: a single chromosome and a whole genome.
* Do not allow this!
*        +---+---+---+
*        |S1 |S2 |S3 |
* +------+---+---+---+
* | Chr1 | X | X | O |
* +------+---+---+ O |
* | Chr2 | X | O | O |
* +------+---+---+ O |
* | Chr3 | X | X | O |
* +------+---+---+---+
*
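* A minimal usage sketch (hypothetical variable names; the batch of stage documents is assumed to be
* read elsewhere, and the returned operations are executed against the variants collection by a
* separate writer):
* <pre>{@code
* MongoDBVariantMerger merger = new MongoDBVariantMerger(dbAdaptor, studyConfiguration,
*         fileIds, indexedFiles, false, false);
* List<Document> stageBatch = readStageBatch();             // hypothetical reader of stage documents
* MongoDBOperations operations = merger.merge(stageBatch);  // operations to execute in the variants collection
* }</pre>
*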
* @author Jacobo Coll &lt;jacobo167@gmail.com&gt;
*/
public class MongoDBVariantMerger implements ParallelTaskRunner.Task<Document, MongoDBOperations> {
private final VariantDBAdaptor dbAdaptor;
/** Study to be merged. */
private final Integer studyId;
private final String studyIdStr;
/** Files to be merged. */
private final List<Integer> fileIds;
/** Indexed files in the region that we are merging. */
private final Set<Integer> indexedFiles;
/**
* Check overlapping variants.
* Only needed when loading more than one file at the same time, or when other files were already loaded in the same region.
*/
private boolean checkOverlappings;
private final DocumentToVariantConverter variantConverter;
private final DocumentToStudyVariantEntryConverter studyConverter;
private final StudyConfiguration studyConfiguration;
private final boolean excludeGenotypes;
private final boolean addUnknownGenotypes;
// Variables that must be protected against concurrent modification
private final Map<Integer, LinkedHashMap<String, Integer>> samplesPositionMap;
private final List<Integer> indexedSamples;
private final Logger logger = LoggerFactory.getLogger(MongoDBVariantMerger.class);
private final VariantMerger variantMerger;
private final List<String> format;
private boolean resume;
public MongoDBVariantMerger(VariantDBAdaptor dbAdaptor, StudyConfiguration studyConfiguration, List<Integer> fileIds,
Set<Integer> indexedFiles, boolean resume, boolean ignoreOverlapping) {
this.dbAdaptor = Objects.requireNonNull(dbAdaptor);
this.studyConfiguration = Objects.requireNonNull(studyConfiguration);
this.fileIds = Objects.requireNonNull(fileIds);
this.indexedFiles = Objects.requireNonNull(indexedFiles);
excludeGenotypes = getExcludeGenotypes(studyConfiguration);
format = buildFormat(studyConfiguration);
indexedSamples = Collections.unmodifiableList(buildIndexedSamplesList(fileIds));
studyId = studyConfiguration.getStudyId();
studyIdStr = String.valueOf(studyId);
String defaultGenotype = studyConfiguration.getAttributes().getString(DEFAULT_GENOTYPE.key(), "");
if (defaultGenotype.equals(DocumentToSamplesConverter.UNKNOWN_GENOTYPE)) {
logger.debug("Do not need fill unknown genotype array. DefaultGenotype is UNKNOWN_GENOTYPE({}).",
DocumentToSamplesConverter.UNKNOWN_GENOTYPE);
addUnknownGenotypes = false;
} else if (excludeGenotypes) {
logger.debug("Do not need fill unknown genotype array. Excluding genotypes.");
addUnknownGenotypes = false;
} else {
addUnknownGenotypes = true;
}
checkOverlappings = !ignoreOverlapping && (fileIds.size() > 1 || !indexedFiles.isEmpty());
DocumentToSamplesConverter samplesConverter = new DocumentToSamplesConverter(this.studyConfiguration);
studyConverter = new DocumentToStudyVariantEntryConverter(false, samplesConverter);
variantConverter = new DocumentToVariantConverter(studyConverter, null);
samplesPositionMap = new HashMap<>();
variantMerger = new VariantMerger();
this.resume = resume;
}
@Override
public List<MongoDBOperations> apply(List<Document> batch) {
try {
return Collections.singletonList(merge(batch));
} catch (Exception e) {
if (batch.isEmpty()) {
logger.error("Fail loading empty batch");
} else {
logger.error("Fail loading batch from " + batch.get(0).get("_id") + " to " + batch.get(batch.size() - 1).get("_id"));
}
throw e;
}
}
public MongoDBOperations merge(List<Document> variants) {
// Set of operations to be executed in the Database
MongoDBOperations mongoDBOps = new MongoDBOperations();
Variant previousVariant = null;
Document previousDocument = null;
int start = 0;
int end = 0;
String chromosome = null;
List<Document> overlappedVariants = null;
Iterator<Document> iterator = variants.iterator();
// Get first valid variant
while (iterator.hasNext()) {
Document document = iterator.next();
if (document.get(studyIdStr) != null) {
previousDocument = document;
previousVariant = STRING_ID_CONVERTER.convertToDataModelType(previousDocument);
chromosome = previousVariant.getChromosome();
start = previousVariant.getStart();
end = getEnd(previousVariant);
break;
}
}
while (iterator.hasNext()) {
Document document = iterator.next();
Variant variant = STRING_ID_CONVERTER.convertToDataModelType(document);
Document study = document.get(studyIdStr, Document.class);
if (study != null) {
if (checkOverlappings && variant.overlapWith(chromosome, start, end, true)) {
// If the variant overlaps with the last one, add to the overlappedVariants list.
// Do not process any variant yet!
if (overlappedVariants == null) {
overlappedVariants = new ArrayList<>();
overlappedVariants.add(previousDocument);
}
overlappedVariants.add(document);
// Take min start and max end
start = Math.min(start, variant.getStart());
end = Math.max(end, getEnd(variant));
} else {
// If the current variant does not overlap with the previous variant, we can load the previous variant (or region)
processVariants(overlappedVariants, previousDocument, previousVariant, mongoDBOps);
overlappedVariants = null;
// Reset region
chromosome = variant.getChromosome();
start = variant.getStart();
end = getEnd(variant);
}
previousDocument = document;
previousVariant = variant;
}
}
// Process remaining variants
processVariants(overlappedVariants, previousDocument, previousVariant, mongoDBOps);
// // Execute MongoDB Operations
// return executeMongoDBOperations(mongoDBOps);
return mongoDBOps;
}
public void processVariants(List<Document> overlappedVariants, Document document, Variant variant,
MongoDBOperations mongoDBOps) {
try {
if (overlappedVariants != null) {
for (Document overlappedVariant : overlappedVariants) {
if (alreadyProcessedStageDocument(overlappedVariant)) {
// Skip this batch if any of the documents is already processed
mongoDBOps.setMissingVariantsNoFillGaps(mongoDBOps.getMissingVariantsNoFillGaps() + overlappedVariants.size());
return;
}
}
processOverlappedVariants(overlappedVariants, mongoDBOps);
} else if (document != null) {
if (alreadyProcessedStageDocument(document)) {
mongoDBOps.setMissingVariantsNoFillGaps(mongoDBOps.getMissingVariantsNoFillGaps() + 1);
return;
}
processVariant(document, variant, mongoDBOps);
}
} catch (Exception e) {
logger.error("Error processing variant " + variant, e);
throw e;
}
}
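/**
* A stage document counts as already processed when any of the files being merged is present in the
* study document but its value is null, which is assumed to mean that the stage entry was already
* cleaned by a previous merge.
*/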
public boolean alreadyProcessedStageDocument(Document overlappedVariant) {
Document study = overlappedVariant.get(studyIdStr, Document.class);
for (Integer fileId : fileIds) {
if (study.containsKey(fileId.toString())) {
// If any of the files is null, the document is already processed.
return study.get(fileId.toString()) == null;
}
}
return false;
}
public Integer getEnd(Variant variant) {
// if (variant.getType().equals(VariantType.SYMBOLIC) || variant.getType().equals(VariantType.NO_VARIATION)) {
// return variant.getEnd();
// } else {
// return variant.getStart() + Math.max(variant.getReference().length() - 1, -1 /* 0 */);
// }
if (EnumSet.of(VariantType.SYMBOLIC, VariantType.CNV).contains(variant.getType())) {
return variant.getStart();
} else {
return variant.getEnd();
}
}
/**
* Given a document from the stage collection, transforms the document into a set of MongoDB operations.
*
* It may be a new variant document in the database, a new study in the document, or just an update of an existing study variant.
*
* @param document Document to load
* @param emptyVar Parsed empty variant of the document. Only chr, pos, ref, alt
* @param mongoDBOps Set of MongoDB operations to update
*/
protected void processVariant(Document document, Variant emptyVar, MongoDBOperations mongoDBOps) {
Document study = document.get(studyIdStr, Document.class);
// New variant in the study.
boolean newStudy = isNewStudy(study);
// New variant in the collection if the study is new and the document only contains {_id, end, ref, alt, study}
boolean newVariant = isNewVariant(document, newStudy);
Set<String> ids = new HashSet<>();
List<Document> fileDocuments = new LinkedList<>();
List<Document> alternateDocuments = new LinkedList<>();
Document gts = new Document();
// Loop for each file that has to be merged
int missing = 0;
int skipped = 0;
int duplicated = 0;
for (Integer fileId : fileIds) {
// Different actions if the file is present or missing in the document.
if (study.containsKey(fileId.toString())) {
//Duplicated documents are treated like missing. Increment the number of duplicated variants
List<Binary> duplicatedVariants = getListFromDocument(study, fileId.toString());
if (duplicatedVariants.size() > 1) {
mongoDBOps.setNonInserted(mongoDBOps.getNonInserted() + duplicatedVariants.size());
if (addUnknownGenotypes) {
addSampleIdsGenotypes(gts, UNKNOWN_GENOTYPE, getSamplesInFile(fileId));
}
logDuplicatedVariant(emptyVar, duplicatedVariants.size(), fileId);
duplicated++;
continue;
}
Binary file = duplicatedVariants.get(0);
Variant variant = VARIANT_CONVERTER_DEFAULT.convertToDataModelType(file);
if (variant.getType().equals(VariantType.NO_VARIATION) || variant.getType().equals(VariantType.SYMBOLIC)) {
mongoDBOps.setSkipped(mongoDBOps.getSkipped() + 1);
skipped++;
continue;
}
if (StringUtils.isNotEmpty(variant.getId()) && !variant.getId().equals(variant.toString())) {
ids.add(variant.getId());
}
if (variant.getNames() != null) {
ids.addAll(variant.getNames());
}
emptyVar.setType(variant.getType());
variant.getStudies().get(0).setSamplesPosition(getSamplesPosition(fileId));
Document newDocument = studyConverter.convertToStorageType(variant, variant.getStudies().get(0));
fileDocuments.add((Document) getListFromDocument(newDocument, FILES_FIELD).get(0));
alternateDocuments = getListFromDocument(newDocument, ALTERNATES_FIELD);
if (newDocument.containsKey(GENOTYPES_FIELD)) {
for (Map.Entry<String, Object> entry : newDocument.get(GENOTYPES_FIELD, Document.class).entrySet()) {
addSampleIdsGenotypes(gts, entry.getKey(), (List<Integer>) entry.getValue());
}
}
} else if (addUnknownGenotypes) {
// logger.debug("File {} not in variant {}", fileId, emptyVar);
addSampleIdsGenotypes(gts, UNKNOWN_GENOTYPE, getSamplesInFile(fileId));
missing++;
}
}
if (newStudy && addUnknownGenotypes) {
//If it is a new variant for the study, add the already loaded samples as UNKNOWN
addSampleIdsGenotypes(gts, UNKNOWN_GENOTYPE, getIndexedSamples());
}
addCleanStageOperations(document, mongoDBOps, newStudy, missing, skipped, duplicated);
updateMongoDBOperations(emptyVar, new ArrayList<>(ids), fileDocuments, alternateDocuments, gts, newStudy, newVariant, mongoDBOps);
}
protected void processOverlappedVariants(List<Document> overlappedVariants, MongoDBOperations mongoDBOps) {
for (Document document : overlappedVariants) {
try {
processOverlappedVariants(document, overlappedVariants, mongoDBOps);
} catch (Exception e) {
Variant mainVariant = STRING_ID_CONVERTER.convertToDataModelType(document);
List<Variant> variants = overlappedVariants.stream()
.map(STRING_ID_CONVERTER::convertToDataModelType)
.collect(Collectors.toList());
logger.error("Error processing variant " + mainVariant + " in overlapped variants " + variants);
throw e;
}
}
}
/**
* Given a list of documents from the stage collection, and one variant from that list,
* merges the overlapping documents into the main variant and transforms the result into a set of MongoDB operations.
*
* It may be a new variant document in the database, a new study in the document, or just an update of an existing study variant.
*
* @param mainDocument Main document to add.
* @param overlappedVariants Overlapping documents from Stage collection.
* @param mongoDBOps Set of MongoDB operations to update
*/
protected void processOverlappedVariants(Document mainDocument, List<Document> overlappedVariants, MongoDBOperations mongoDBOps) {
Variant mainVariant = STRING_ID_CONVERTER.convertToDataModelType(mainDocument);
int variantsWithValidData = getVariantsWithValidData(mainVariant, overlappedVariants);
Document study = mainDocument.get(studyId.toString(), Document.class);
// New variant in the study.
boolean newStudy = isNewStudy(study);
// New variant in the collection if the study is new and the document only contains {_id, end, ref, alt, study}
boolean newVariant = isNewVariant(mainDocument, newStudy);
// A variant counts as duplicated if it is duplicated or missing for all the files.
int duplicatedVariants = 0;
List<String> duplicatedVariantsList = new ArrayList<>();
int duplicatedFiles = 0;
int missingFiles = 0;
for (Integer fileId : fileIds) {
List<Binary> files = getListFromDocument(study, fileId.toString());
if (files == null || files.isEmpty()) {
missingFiles++;
} else if (files.size() > 1) {
duplicatedVariants += files.size();
duplicatedFiles++;
// // If there are more than one variant for this file, increment the number of nonInserted variants.
// // Duplicated variant
logDuplicatedVariant(mainVariant, files.size(), fileId);
for (Binary binary : files) {
Variant duplicatedVariant = VARIANT_CONVERTER_DEFAULT.convertToDataModelType(binary);
String call = duplicatedVariant.getStudies().get(0).getFiles().get(0).getCall();
if (call == null) {
call = duplicatedVariant.toString();
}
duplicatedVariantsList.add(call);
}
}
}
addCleanStageOperations(mainDocument, mongoDBOps, newStudy, missingFiles, 0, duplicatedFiles);
// An overlapping variant will be considered missing if it is missing or duplicated for all the files.
final boolean missingOverlappingVariant;
if (duplicatedFiles + missingFiles == fileIds.size()) {
// C3.1), C4.1), C5), B3), D1), D2)
missingOverlappingVariant = true;
if (duplicatedFiles > 0) {
// D1), D2)
logger.error("Duplicated! " + mainVariant + " " + duplicatedVariantsList);
mongoDBOps.setNonInserted(mongoDBOps.getNonInserted() + duplicatedVariants);
}
// No information for this variant
if (newStudy) {
// B3), D1), D2)
return;
}
// else {
// Do not skip. Fill gaps.
// No new overlapped variants.
// }
if (variantsWithValidData != 0) {
// Scenarios C3.1), C4.1)
logger.debug("Missing overlapped variant! {}, {}", fileIds, mainVariant);
mongoDBOps.setOverlappedVariants(mongoDBOps.getOverlappedVariants() + 1);
}
// else {
// If the files to be loaded were not present in the current variant, there is no overlapped variant.
// See scenario C5)
// }
} else {
missingOverlappingVariant = false;
}
// Merge documents
Variant variant = mergeOverlappedVariants(mainVariant, overlappedVariants);
Document gts = new Document();
List<Document> fileDocuments = new LinkedList<>();
List<Document> alternateDocuments = null;
StudyEntry studyEntry = variant.getStudies().get(0);
// For all the files that are being indexed
for (Integer fileId : fileIds) {
FileEntry file = studyEntry.getFile(fileId.toString());
if (file == null) {
file = studyEntry.getFile(String.valueOf(-fileId));
}
if (file != null) {
Document studyDocument = studyConverter.convertToStorageType(variant, studyEntry, file, getSampleNamesInFile(fileId));
if (studyDocument.containsKey(GENOTYPES_FIELD)) {
studyDocument.get(GENOTYPES_FIELD, Document.class)
.forEach((gt, sampleIds) -> addSampleIdsGenotypes(gts, gt, (Collection<Integer>) sampleIds));
}
fileDocuments.addAll(getListFromDocument(studyDocument, FILES_FIELD));
alternateDocuments = getListFromDocument(studyDocument, ALTERNATES_FIELD);
} else if (addUnknownGenotypes) {
addSampleIdsGenotypes(gts, UNKNOWN_GENOTYPE, getSamplesInFile(fileId));
}
}
// For the rest of the files (the already indexed ones), only if this variant is new in this study,
// add all the already indexed files' information, if present in this variant.
if (newStudy) {
for (Integer fileId : indexedFiles) {
FileEntry file = studyEntry.getFile(fileId.toString());
if (file == null) {
file = studyEntry.getFile(String.valueOf(-fileId));
}
if (file != null) {
Document studyDocument = studyConverter.convertToStorageType(variant, studyEntry, file,
getSampleNamesInFile(fileId));
if (studyDocument.containsKey(GENOTYPES_FIELD)) {
studyDocument.get(GENOTYPES_FIELD, Document.class)
.forEach((gt, sampleIds) -> addSampleIdsGenotypes(gts, gt, (Collection<Integer>) sampleIds));
}
fileDocuments.addAll(getListFromDocument(studyDocument, FILES_FIELD));
} else if (addUnknownGenotypes) {
addSampleIdsGenotypes(gts, UNKNOWN_GENOTYPE, getSamplesInFile(fileId));
}
}
}
updateMongoDBOperations(mainVariant, variant.getIds(), fileDocuments, alternateDocuments, gts, newStudy, newVariant,
mongoDBOps);
}
private void addCleanStageOperations(Document document, MongoDBOperations mongoDBOps, boolean newStudy, int missing,
int skipped, int duplicated) {
if (newStudy && duplicated > 0 && (missing + skipped + duplicated) == fileIds.size()) {
// System.out.println("duplicated: document.getString(\"_id\") = " + document.getString("_id"));
mongoDBOps.getDocumentsToCleanStudies().add(document.getString("_id"));
} else {
if (missing != fileIds.size()) {
mongoDBOps.getDocumentsToCleanFiles().add(document.getString("_id"));
} // else {
// logger.debug("Nothing to clean in variant " + document.getString("_id") + " , " + fileIds);
// }
}
}
private void logDuplicatedVariant(Variant variant, int numDuplicates, Integer fileId) {
logger.warn("Found {} duplicated variants for file {} in variant {}.", numDuplicates, fileId, variant);
}
/**
* Given a collection of documents from the stage collection, returns the number of documents (variants) with valid data.
* i.e. at least one file with information and not duplicated.
*
* @param mainVariant Main variant. Only data overlapping with the main variant counts as valid.
* @param documents Variants from the stage collection
* @return Number of variants with valid data.
*/
private int getVariantsWithValidData(Variant mainVariant, Collection<Document> documents) {
int variantsWithValidData = 0;
for (Document document : documents) {
if (!mainVariant.overlapWith(STRING_ID_CONVERTER.convertToDataModelType(document), true)) {
continue;
}
Document study = document.get(studyIdStr, Document.class);
boolean existingFiles = false;
for (Integer fileId : fileIds) {
List<Binary> files = getListFromDocument(study, fileId.toString());
if (files != null && files.size() == 1) {
existingFiles = true;
break;
}
}
if (existingFiles) {
variantsWithValidData++;
}
}
return variantsWithValidData;
}
/**
* Given a list of overlapped documents from the stage collection, merges them, resolving the overlapping positions.
*
* If there is any conflict between overlapped positions, the mainVariant will be selected whenever possible.
*
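* An illustrative example of the expected outcome (hypothetical files and samples), where the
* overlapping alternate is folded into the main variant as a secondary alternate and the genotypes
* are recoded accordingly:
* <pre>
*   mainVariant : 1:1000:A:C   (sample S1, file F1 : 0/1)
*   overlapped  : 1:1000:A:T   (sample S2, file F2 : 0/1)
*   merged      : 1:1000:A:C with secondary alternate 1:1000:A:T ; genotypes S1 = 0/1, S2 = 0/2
* </pre>
*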
* @see VariantMerger
*
* @param mainVariant Main variant to resolve conflicts.
* @param overlappedVariants Overlapping documents from Stage collection.
* @return The merged main variant
*/
protected Variant mergeOverlappedVariants(Variant mainVariant, List<Document> overlappedVariants) {
// System.out.println("--------------------------------");
// System.out.println("Overlapped region = " + overlappedVariants
// .stream()
// .map(doc -> STRING_ID_CONVERTER.convertToDataModelType(doc.getString("_id")))
// .collect(Collectors.toList()));
// The overlapping region will be new if any of the variants is new for the study
boolean newOverlappingRegion = false;
// The overlapping region will be completely new if ALL the variants are new for the study
boolean completelyNewOverlappingRegion = true;
Map<Integer, List<Variant>> variantsPerFile = new HashMap<>();
for (Integer fileId : fileIds) {
variantsPerFile.put(fileId, new LinkedList<>());
}
Variant mainVariantNew = null;
List<Variant> variants = new ArrayList<>(overlappedVariants.size());
List<Boolean> newStudies = new ArrayList<>(overlappedVariants.size());
// For each variant, create an empty variant that will be filled by the VariantMerger
for (Document document : overlappedVariants) {
Variant var = STRING_ID_CONVERTER.convertToDataModelType(document);
if (!mainVariant.overlapWith(var, true)) {
// Skip those variants that do not overlap with the given main variant
continue;
}
Document study = document.get(studyIdStr, Document.class);
// New variant in the study.
boolean newStudy = isNewStudy(study);
newStudies.add(newStudy);
// It's a new OverlappingRegion if at least one variant is new in this study
newOverlappingRegion |= newStudy;
// It's a completely new OverlappingRegion if all the variants are new in this study
completelyNewOverlappingRegion &= newStudy;
variants.add(var);
if (sameVariant(var, mainVariant)) {
mainVariantNew = var;
StudyEntry se = new StudyEntry(studyId.toString(), new LinkedList<>(), format);
se.setSamplesPosition(new HashMap<>());
var.addStudyEntry(se);
}
HashSet<String> ids = new HashSet<>();
for (Integer fileId : fileIds) {
List<Binary> files = getListFromDocument(study, fileId.toString());
if (files != null && files.size() == 1) {
// If there is only one variant for this file, add to the map variantsPerFile
Variant variant = VARIANT_CONVERTER_DEFAULT.convertToDataModelType(files.get(0));
variant.getStudies().get(0).setSamplesPosition(getSamplesPosition(fileId));
variantsPerFile.get(fileId).add(variant);
ids.addAll(variant.getIds());
}
}
var.setIds(new ArrayList<>(ids));
}
if (mainVariantNew == null) {
// This should never happen
throw new IllegalStateException("Main variant was not one of the variants to merge");
}
List<Integer> overlappingFiles = new ArrayList<>();
List<Variant> variantsToMerge = new LinkedList<>();
for (Integer fileId : fileIds) {
List<Variant> variantsInFile = variantsPerFile.get(fileId);
switch (variantsInFile.size()) {
case 0:
break;
case 1:
variantsToMerge.add(variantsInFile.get(0));
if (!sameVariant(variantsInFile.get(0), mainVariant)) {
overlappingFiles.add(fileId);
}
break;
default:
// If there are overlapping variants, select the mainVariant if possible.
Variant var = null;
for (Variant variant : variantsInFile) {
if (sameVariant(variant, mainVariant)) {
var = variant;
}
}
// If not found, get the first
if (var == null) {
var = variantsInFile.get(0);
overlappingFiles.add(fileId);
// logger.info("Variant " + mainVariant + " not found in " + variantsInFile);
}
variantsToMerge.add(var);
// Get the original call from the selected variant
String call = var.getStudies().get(0).getFiles().get(0).getCall();
if (call != null) {
if (call.isEmpty()) {
call = null;
} else {
call = call.substring(0, call.lastIndexOf(':'));
}
}
// Do not warn about overlapping variants if genotypes are being excluded
if (!excludeGenotypes) {
boolean prompted = false;
for (int i = 1; i < variantsInFile.size(); i++) {
Variant auxVar = variantsInFile.get(i);
// Check if the variants were part of the same multiallelic variant
String auxCall = auxVar.getStudies().get(0).getFiles().get(0).getCall();
if (!prompted && (auxCall == null || call == null || !auxCall.startsWith(call))) {
logger.warn("Overlapping variants in file {} : {}", fileId, variantsInFile);
prompted = true;
}
// // Those variants that do not overlap with the selected variant won't be inserted
// if (!auxVar.overlapWith(var, true)) {
// mongoDBOps.nonInserted++;
// logger.warn("Skipping overlapped variant " + auxVar);
// }
}
}
break;
}
}
/*
* If this is a new overlapping region and there are some files already indexed,
* Fetch the information from the database regarding the loaded variants of this region.
*
*      +---+---+---+---+
*      | A | B | C | D |
* +----+---+---+---+---+
* | V1 | X | X |   |   |
* +----+---+---+---+---+
* | V2 | X | X |   | X |
* +----+---+---+---+---+
* | V3 |   |   | X |   |
* +----+---+---+---+---+
*
* - Files A and B are loaded
* - Files C and D are being loaded
* - Variants V1,V2,V3 are overlapping
*
* In order to merge the data properly, we need to get from the server the information about
* the variants {V1, V2} for the files {A, B}.
*
* Because the variants {V1, V2} are already loaded, the information that we need is duplicated
* in both variants, so we only need to get one of them.
*
*/
if (!completelyNewOverlappingRegion && newOverlappingRegion && !indexedFiles.isEmpty()) {
int i = 0;
for (Variant variant : variants) {
// If the variant is not new in this study, query to the database for the loaded info.
if (!newStudies.get(i)) {
QueryResult<Variant> queryResult = fetchVariant(variant);
if (queryResult.getResult().size() == 1 && queryResult.first().getStudies().size() == 1) {
// Check if each file entry belongs to an overlapping variant. If so, mark it as overlapped by inverting the fileId!
for (FileEntry fileEntry : queryResult.first().getStudies().get(0).getFiles()) {
boolean empty = StringUtils.isEmpty(fileEntry.getCall());
if (empty && !sameVariant(mainVariant, queryResult.first())
|| !empty && !sameVariant(mainVariant, fileEntry.getCall())) {
markAsOverlapped(fileEntry);
} else {
markAsNonOverlapped(fileEntry);
}
}
variantsToMerge.add(queryResult.first());
} else {
if (queryResult.getResult().isEmpty()) {
throw new IllegalStateException("Variant " + variant + " not found!");
} else {
throw new IllegalStateException("Variant " + variant + " found wrong! : " + queryResult.getResult());
}
}
// Because the loaded variants were an overlapped region, all the information required is in every variant.
// Fetch only one variant
break;
}
i++;
}
}
// Finally, merge variants
variantMerger.merge(mainVariantNew, variantsToMerge);
if (!overlappingFiles.isEmpty()) {
for (FileEntry fileEntry : mainVariantNew.getStudies().get(0).getFiles()) {
int fileId = Integer.parseInt(fileEntry.getFileId());
if (overlappingFiles.contains(fileId)) {
markAsOverlapped(fileEntry);
}
}
}
return mainVariantNew;
}
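/**
* File entries whose data comes from an overlapping variant, rather than from this exact variant, are
* flagged by negating their fileId; this is why files are also looked up by their negative id above.
*/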
private void markAsOverlapped(FileEntry fileEntry) {
int fid = Integer.parseInt(fileEntry.getFileId());
if (fid > 0) {
fileEntry.setFileId(String.valueOf(-fid));
}
}
private void markAsNonOverlapped(FileEntry fileEntry) {
int fid = Integer.parseInt(fileEntry.getFileId());
if (fid < 0) {
fileEntry.setFileId(String.valueOf(-fid));
}
}
/**
* Reads the given variant from the 'variants' collection.
*
* The default timeout of 3s may not be enough when there are a lot of
* concurrent writes to the "variants" collection, so a larger timeout is used.
* A retry is also added, just in case.
* @param variant Variant to read
* @return Query result of the query
*/
private QueryResult<Variant> fetchVariant(Variant variant) {
QueryResult<Variant> queryResult = null;
int maxNumFails = 2;
int fails = 0;
while (queryResult == null) {
try {
queryResult = dbAdaptor.get(new Query()
.append(VariantDBAdaptor.VariantQueryParams.ID.key(), variant.toString())
.append(VariantDBAdaptor.VariantQueryParams.UNKNOWN_GENOTYPE.key(), ".")
.append(VariantDBAdaptor.VariantQueryParams.RETURNED_STUDIES.key(), studyId),
new QueryOptions(QueryOptions.TIMEOUT, 30_000));
} catch (MongoExecutionTimeoutException e) {
fails++;
if (fails < maxNumFails) {
logger.warn("Got timeout exception reading variants. Retry!", e);
} else {
throw e;
}
}
}
return queryResult;
}
/**
* Transform the set of genotypes and file objects into a set of MongoDB operations.
*
* @param emptyVar Parsed empty variant of the document. Only chr, pos, ref, alt
* @param ids Variant identifiers seen for this variant
* @param fileDocuments List of files to be updated
* @param secondaryAlternates SecondaryAlternates documents.
* @param gts Set of genotypes to be updated
* @param newStudy If the variant is new for this study
* @param newVariant If the variant was never seen in the database
* @param mongoDBOps Set of MongoDB operations to update
*/
protected void updateMongoDBOperations(Variant emptyVar, List<String> ids, List<Document> fileDocuments,
List<Document> secondaryAlternates, Document gts, boolean newStudy, boolean newVariant,
MongoDBOperations mongoDBOps) {
if (newStudy) {
// If there were no files and the study is new, do not add a new study.
// It may happen if all the files in the variant were duplicated for this variant.
if (!fileDocuments.isEmpty()) {
Document studyDocument = new Document(STUDYID_FIELD, studyId)
.append(FILES_FIELD, fileDocuments);
if (!excludeGenotypes) {
studyDocument.append(GENOTYPES_FIELD, gts);
}
if (secondaryAlternates != null && !secondaryAlternates.isEmpty()) {
studyDocument.append(ALTERNATES_FIELD, secondaryAlternates);
}
final String id;
List<Bson> updates = new ArrayList<>();
updates.add(push(STUDIES_FIELD, studyDocument));
if (newVariant) {
Document variantDocument = variantConverter.convertToStorageType(emptyVar);
updates.add(addEachToSet(IDS_FIELD, ids));
for (Map.Entry<String, Object> entry : variantDocument.entrySet()) {
if (!entry.getKey().equals("_id") && !entry.getKey().equals(STUDIES_FIELD) && !entry.getKey().equals(IDS_FIELD)) {
Object value = entry.getValue();
if (value instanceof List) {
updates.add(setOnInsert(entry.getKey(), new BsonArray(((List) value))));
} else {
updates.add(setOnInsert(entry.getKey(), value));
}
}
}
mongoDBOps.getNewStudy().getVariants().add(variantDocument);
id = variantDocument.getString("_id");
} else {
id = variantConverter.buildStorageId(emptyVar);
}
mongoDBOps.getNewStudy().getIds().add(id);
mongoDBOps.getNewStudy().getQueries().add(eq("_id", id));
mongoDBOps.getNewStudy().getUpdates().add(combine(updates));
}
} else {
String id = variantConverter.buildStorageId(emptyVar);
List<Bson> mergeUpdates = new LinkedList<>();
if (!ids.isEmpty()) {
mergeUpdates.add(addEachToSet(IDS_FIELD, ids));
}
if (!excludeGenotypes) {
for (String gt : gts.keySet()) {
List sampleIds = getListFromDocument(gts, gt);
if (resume) {
mergeUpdates.add(addEachToSet(STUDIES_FIELD + ".$." + GENOTYPES_FIELD + '.' + gt,
sampleIds));
} else {
mergeUpdates.add(pushEach(STUDIES_FIELD + ".$." + GENOTYPES_FIELD + '.' + gt,
sampleIds));
}
}
}
if (secondaryAlternates != null && !secondaryAlternates.isEmpty()) {
mergeUpdates.add(addEachToSet(STUDIES_FIELD + ".$." + ALTERNATES_FIELD, secondaryAlternates));
}
if (!fileDocuments.isEmpty()) {
mongoDBOps.getExistingStudy().getIds().add(id);
mongoDBOps.getExistingStudy().getQueries().add(and(eq("_id", id),
eq(STUDIES_FIELD + '.' + STUDYID_FIELD, studyId)));
if (resume) {
mergeUpdates.add(addEachToSet(STUDIES_FIELD + ".$." + FILES_FIELD, fileDocuments));
} else {
mergeUpdates.add(pushEach(STUDIES_FIELD + ".$." + FILES_FIELD, fileDocuments));
}
mongoDBOps.getExistingStudy().getUpdates().add(combine(mergeUpdates));
} else if (!mergeUpdates.isEmpty()) {
// These files are not present in this variant. Increase the number of missing variants.
mongoDBOps.setMissingVariants(mongoDBOps.getMissingVariants() + 1);
mongoDBOps.getExistingStudy().getIds().add(id);
mongoDBOps.getExistingStudy().getQueries().add(and(eq("_id", id),
eq(STUDIES_FIELD + '.' + STUDYID_FIELD, studyId)));
mongoDBOps.getExistingStudy().getUpdates().add(combine(mergeUpdates));
} else {
mongoDBOps.setMissingVariantsNoFillGaps(mongoDBOps.getMissingVariantsNoFillGaps() + 1);
}
}
}
/**
* Checks whether this is a new variant for the study, depending on the value of the field {@link MongoDBVariantStageLoader#NEW_STUDY_FIELD}.
* @param study Study object
* @return If this is the first time that the variant has been seen in this study.
*/
public static boolean isNewStudy(Document study) {
return study.getBoolean(MongoDBVariantStageLoader.NEW_STUDY_FIELD, MongoDBVariantStageLoader.NEW_STUDY_DEFAULT);
}
public static boolean isNewVariant(Document document, boolean newStudy) {
// Fast path: the study is new and the document only contains the study, _id, end, ref and alt fields (5 entries).
// Otherwise, the variant is new only if every study in the document is new.
if (!newStudy || document.size() != 5) {
for (Map.Entry<String, Object> entry : document.entrySet()) {
if (!entry.getKey().equals(VariantStringIdConverter.ID_FIELD)
&& !entry.getKey().equals(VariantStringIdConverter.END_FIELD)
&& !entry.getKey().equals(VariantStringIdConverter.REF_FIELD)
&& !entry.getKey().equals(VariantStringIdConverter.ALT_FIELD)) {
if (!isNewStudy((Document) entry.getValue())) {
return false;
}
}
}
}
return true;
}
private boolean sameVariant(Variant variant, Variant other) {
return variant.getChromosome().equals(other.getChromosome())
&& variant.getStart().equals(other.getStart())
&& variant.getReference().equals(other.getReference())
&& variant.getAlternate().equals(other.getAlternate());
}
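/**
* Checks whether an original call, encoded as "start:reference:alternates:...", normalizes to the same
* start, reference and alternate as the given variant. The alternates field may contain several
* comma-separated alleles.
*/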
private boolean sameVariant(Variant variant, String call) {
String[] split = call.split(":", -1);
List<VariantNormalizer.VariantKeyFields> normalized = new VariantNormalizer()
.normalize(variant.getChromosome(), Integer.parseInt(split[0]), split[1], Arrays.asList(split[2].split(",")));
for (VariantNormalizer.VariantKeyFields variantKeyFields : normalized) {
if (variantKeyFields.getStart() == variant.getStart()
&& variantKeyFields.getReference().equals(variant.getReference())
&& variantKeyFields.getAlternate().equals(variant.getAlternate())) {
return true;
}
}
return false;
}
protected void addSampleIdsGenotypes(Document gts, String genotype, Collection<Integer> sampleIds) {
if (sampleIds.isEmpty()) {
return;
}
if (gts.containsKey(genotype)) {
getListFromDocument(gts, genotype).addAll(sampleIds);
} else {
gts.put(genotype, new LinkedList<>(sampleIds));
}
}
@SuppressWarnings("unchecked")
private <T> List<T> getListFromDocument(Document document, String key) {
return document.get(key, List.class);
}
protected List<Integer> getIndexedSamples() {
return indexedSamples;
}
private List<Integer> buildIndexedSamplesList(List<Integer> fileIds) {
List<Integer> indexedSamples = new LinkedList<>(StudyConfiguration.getIndexedSamples(studyConfiguration).values());
for (Integer fileId : fileIds) {
indexedSamples.removeAll(getSamplesInFile(fileId));
}
indexedSamples.sort(Integer::compareTo);
return indexedSamples;
}
protected LinkedHashSet<Integer> getSamplesInFile(Integer fileId) {
return studyConfiguration.getSamplesInFiles().get(fileId);
}
protected LinkedHashSet<String> getSampleNamesInFile(Integer fileId) {
LinkedHashSet<String> samples = new LinkedHashSet<>();
getSamplesInFile(fileId).forEach(sampleId -> {
samples.add(studyConfiguration.getSampleIds().inverse().get(sampleId));
});
return samples;
}
protected LinkedHashMap<String, Integer> getSamplesPosition(Integer fileId) {
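// Lazily build and cache the samples position for each file.
// Double-checked locking keeps the cache consistent when several tasks run concurrently.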
if (!samplesPositionMap.containsKey(fileId)) {
synchronized (samplesPositionMap) {
if (!samplesPositionMap.containsKey(fileId)) {
LinkedHashMap<String, Integer> samplesPosition = new LinkedHashMap<>();
for (Integer sampleId : studyConfiguration.getSamplesInFiles().get(fileId)) {
samplesPosition.put(studyConfiguration.getSampleIds().inverse().get(sampleId), samplesPosition.size());
}
samplesPositionMap.put(fileId, samplesPosition);
}
}
}
return samplesPositionMap.get(fileId);
}
public List<String> buildFormat(StudyConfiguration studyConfiguration) {
List<String> format = new LinkedList<>();
if (!excludeGenotypes) {
format.add(VariantMerger.GT_KEY);
}
format.addAll(studyConfiguration.getAttributes().getAsStringList(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key()));
return format;
}
public boolean getExcludeGenotypes(StudyConfiguration studyConfiguration) {
return studyConfiguration.getAttributes().getBoolean(VariantStorageEngine.Options.EXCLUDE_GENOTYPES.key(),
VariantStorageEngine.Options.EXCLUDE_GENOTYPES.defaultValue());
}
}