package org.opencb.opencga.storage.mongodb.variant.load.variants;
import com.mongodb.ErrorCategory;
import com.mongodb.MongoBulkWriteException;
import com.mongodb.bulk.BulkWriteError;
import com.mongodb.bulk.BulkWriteResult;
import org.apache.commons.lang3.time.StopWatch;
import org.bson.Document;
import org.bson.conversions.Bson;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryResult;
import org.opencb.commons.datastore.mongodb.MongoDBCollection;
import org.opencb.commons.io.DataWriter;
import org.opencb.opencga.core.common.ProgressLogger;
import org.opencb.opencga.storage.mongodb.variant.adaptors.VariantMongoDBAdaptor;
import org.opencb.opencga.storage.mongodb.variant.load.MongoDBVariantWriteResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.regex.Matcher;
import static com.mongodb.client.model.Filters.*;
import static com.mongodb.client.model.Updates.*;
import static org.opencb.opencga.storage.mongodb.variant.converters.DocumentToStudyVariantEntryConverter.FILEID_FIELD;
import static org.opencb.opencga.storage.mongodb.variant.converters.DocumentToStudyVariantEntryConverter.FILES_FIELD;
import static org.opencb.opencga.storage.mongodb.variant.converters.DocumentToVariantConverter.STUDIES_FIELD;
import static org.opencb.opencga.storage.mongodb.variant.load.stage.MongoDBVariantStageLoader.DUP_KEY_WRITE_RESULT_ERROR_PATTERN;
import static org.opencb.opencga.storage.mongodb.variant.load.stage.MongoDBVariantStageLoader.NEW_STUDY_FIELD;
/**
 * {@link DataWriter} that executes batches of {@link MongoDBOperations} against the variants collection
 * and, optionally, cleans the processed documents from the stage collection.
 *
 * The accumulated statistics of all the executed operations are available through {@link #getResult()}.
 *
 * Created on 21/11/16.
 *
 * @author Jacobo Coll <jacobo167@gmail.com>
 */
public class MongoDBVariantMergeLoader implements DataWriter<MongoDBOperations> {

    private final Logger logger = LoggerFactory.getLogger(MongoDBVariantMergeLoader.class);

    private static final QueryOptions QUERY_OPTIONS = new QueryOptions();
    private static final QueryOptions UPSERT_AND_REPLACE = new QueryOptions(MongoDBCollection.UPSERT, true)
            .append(MongoDBCollection.REPLACE, true);
    private static final QueryOptions UPSERT = new QueryOptions(MongoDBCollection.UPSERT, true);

    private final ProgressLogger progressLogger;
    private final MongoDBCollection variantsCollection;
    private final MongoDBCollection stageCollection;

    /** When true, new-study operations are executed as "insert, then update" instead of a single upsert. */
    private final boolean resume;
    /** When true, the stage collection is cleaned after every executed set of operations. */
    private final boolean cleanWhileLoading;
    private final Integer studyId;

    /** Files to be loaded. */
    private final List<Integer> fileIds;

    // Variables that must be aware of concurrent modification.
    // Every merge into this object is done while holding its monitor.
    private final MongoDBVariantWriteResult result;

    /**
     * @param variantsCollection Collection where the variants are written
     * @param stageCollection    Stage collection, cleaned when {@code cleanWhileLoading} is true
     * @param studyId            Study of the files being loaded
     * @param fileIds            Files being loaded
     * @param resume             Execute the new-study operations in resume mode (insert + update instead of upsert)
     * @param cleanWhileLoading  Clean the stage collection after each executed set of operations
     * @param progressLogger     Progress logger to notify the processed variants. May be null.
     */
    public MongoDBVariantMergeLoader(MongoDBCollection variantsCollection, MongoDBCollection stageCollection,
                                     Integer studyId, List<Integer> fileIds, boolean resume, boolean cleanWhileLoading,
                                     ProgressLogger progressLogger) {
        this.progressLogger = progressLogger;
        this.variantsCollection = variantsCollection;
        this.stageCollection = stageCollection;
        this.resume = resume;
        this.studyId = studyId;
        this.fileIds = fileIds;
        this.result = new MongoDBVariantWriteResult();
        this.cleanWhileLoading = cleanWhileLoading;
    }

    /**
     * Execute, in order, every set of operations of the batch.
     *
     * @param batch Sets of MongoDB operations to execute
     * @return Always true. Failures are reported with exceptions.
     */
    @Override
    public boolean write(List<MongoDBOperations> batch) {
        for (MongoDBOperations mongoDBOperations : batch) {
            executeMongoDBOperations(mongoDBOperations);
        }
        return true;
    }

    /**
     * @return Accumulated write result of all the operations executed so far
     */
    public MongoDBVariantWriteResult getResult() {
        return result;
    }

    /**
     * Execute the set of mongoDB operations.
     *
     * Runs the new-study operations, then the existing-study updates, optionally cleans the stage
     * collection, merges the statistics into the accumulated result and notifies the progress logger.
     *
     * @param mongoDBOps MongoDB operations to execute
     * @return MongoDBVariantWriteResult with the statistics of this set of operations only
     */
    protected MongoDBVariantWriteResult executeMongoDBOperations(MongoDBOperations mongoDBOps) {
        long newVariantsTime = 0; // Impossible to know how much time spend in insert or update in operation "UPSERT"

        // Capture the sizes BEFORE executing: a retry after a duplicated key error removes the
        // already executed entries from the new-study lists, so reading the sizes afterwards
        // would give wrong (even negative) statistics.
        final int newStudyQueries = mongoDBOps.getNewStudy().getQueries().size();
        final int newStudyUpdates = mongoDBOps.getNewStudy().getUpdates().size();

        StopWatch existingVariants = StopWatch.createStarted();
        long newVariants = 0;
        if (newStudyQueries != 0) {
            newVariants = executeMongoDBOperationsNewStudy(mongoDBOps, true);
        }
        existingVariants.stop();

        StopWatch fillGapsVariants = StopWatch.createStarted();
        if (!mongoDBOps.getExistingStudy().getQueries().isEmpty()) {
            QueryResult<BulkWriteResult> update = variantsCollection.update(mongoDBOps.getExistingStudy().getQueries(),
                    mongoDBOps.getExistingStudy().getUpdates(), QUERY_OPTIONS);
            // Every query is expected to match exactly one existing variant
            if (update.first().getMatchedCount() != mongoDBOps.getExistingStudy().getQueries().size()) {
                onUpdateError("fill gaps", update, mongoDBOps.getExistingStudy().getQueries(), mongoDBOps.getExistingStudy().getIds());
            }
        }
        fillGapsVariants.stop();

        if (cleanWhileLoading) {
            cleanStage(mongoDBOps);
        }

        long updatesNewStudyExistingVariant = newStudyUpdates - newVariants;
        long updatesWithDataExistingStudy = mongoDBOps.getExistingStudy().getUpdates().size() - mongoDBOps.getMissingVariants();
        MongoDBVariantWriteResult writeResult = new MongoDBVariantWriteResult(newVariants,
                updatesNewStudyExistingVariant + updatesWithDataExistingStudy, mongoDBOps.getMissingVariants(),
                mongoDBOps.getOverlappedVariants(), mongoDBOps.getSkipped(), mongoDBOps.getNonInserted(), newVariantsTime,
                existingVariants.getNanoTime(), fillGapsVariants.getNanoTime());
        synchronized (result) {
            result.merge(writeResult);
        }

        long processedVariants = newStudyQueries
                + mongoDBOps.getExistingStudy().getQueries().size()
                + mongoDBOps.getMissingVariantsNoFillGaps();
        logProgress(processedVariants);
        return writeResult;
    }

    /**
     * Clean from the stage collection the documents already processed by the given operations.
     *
     * @param mongoDBOps Operations with the stage document ids to clean
     * @return Number of modified documents in the stage collection
     */
    private long cleanStage(MongoDBOperations mongoDBOps) {
        long modifiedCount = 0;

        if (!mongoDBOps.getDocumentsToCleanStudies().isEmpty()) {
            logger.debug("Clean study {} from stage where all the files {} where duplicated : {}", studyId, fileIds,
                    mongoDBOps.getDocumentsToCleanStudies());
            // Remove the whole study entry from these stage documents
            modifiedCount += stageCollection.update(
                    in("_id", mongoDBOps.getDocumentsToCleanStudies()), unset(String.valueOf(studyId)),
                    new QueryOptions(MongoDBCollection.MULTI, true)).first().getModifiedCount();
        }

        if (!mongoDBOps.getDocumentsToCleanFiles().isEmpty()) {
            logger.debug("Cleaning files {} from stage collection", fileIds);
            List<Bson> fileUpdates = new ArrayList<>(fileIds.size() + 1);
            for (Integer fileId : fileIds) {
                // The file field is set to null instead of being removed
                fileUpdates.add(set(studyId + "." + fileId, null));
            }
            fileUpdates.add(set(studyId + "." + NEW_STUDY_FIELD, false));
            modifiedCount += stageCollection.update(in("_id", mongoDBOps.getDocumentsToCleanFiles()), combine(fileUpdates),
                    new QueryOptions(MongoDBCollection.MULTI, true)).first().getModifiedCount();
        }

        return modifiedCount;
    }

    /**
     * Execute the new-study operations.
     *
     * In resume mode, the new variants are inserted (ignoring duplicated key errors) and then the
     * updates are applied only to documents that do not contain any of the files being loaded.
     * Otherwise, all the updates are executed as upserts.
     *
     * If the bulk write fails only with duplicated key errors and {@code retry} is true, the operations
     * that did not fail are removed from {@code mongoDBOps.getNewStudy()} and the remaining ones are
     * executed once more. Note that this modifies the lists held by {@code mongoDBOps}.
     *
     * @param mongoDBOps Operations to execute
     * @param retry      Retry once on duplicated key errors
     * @return Number of new variants (inserted or upserted documents)
     * @throws MongoBulkWriteException if there is any write error other than a duplicated key, if a duplicated
     *                                 key error message can not be parsed, or if the error persists without retry
     */
    private int executeMongoDBOperationsNewStudy(MongoDBOperations mongoDBOps, boolean retry) {
        int newVariants = 0;
        MongoDBOperations.NewStudy newStudy = mongoDBOps.getNewStudy();
        try {
            if (resume) {
                // Ensure files exists
                try {
                    if (!newStudy.getVariants().isEmpty()) {
                        newVariants += newStudy.getVariants().size();
                        variantsCollection.insert(newStudy.getVariants(), QUERY_OPTIONS);
                    }
                } catch (MongoBulkWriteException e) {
                    // NOTE(review): one variant is discounted per reported write error. This is only exact
                    // if the bulk insert is unordered (an ordered insert stops at the first error) - confirm.
                    for (BulkWriteError writeError : e.getWriteErrors()) {
                        if (!ErrorCategory.fromErrorCode(writeError.getCode()).equals(ErrorCategory.DUPLICATE_KEY)) {
                            throw e;
                        } else {
                            // Not inserted variant
                            newVariants--;
                        }
                    }
                }

                // Update only the variants that do not contain yet any of the files being loaded
                List<Bson> queriesExisting = new ArrayList<>(newStudy.getQueries().size());
                for (Bson bson : newStudy.getQueries()) {
                    queriesExisting.add(and(bson, nin(STUDIES_FIELD + "." + FILES_FIELD + "." + FILEID_FIELD, fileIds)));
                }
                // NOTE: The result of this update is not verified. In resume mode some variants may be
                // already updated, so the modified count can not be compared with the number of queries.
                variantsCollection.update(queriesExisting, newStudy.getUpdates(), QUERY_OPTIONS);
            } else {
                QueryResult<BulkWriteResult> update = variantsCollection.update(newStudy.getQueries(), newStudy.getUpdates(), UPSERT);
                if (update.first().getModifiedCount() + update.first().getUpserts().size() != newStudy.getQueries().size()) {
                    onUpdateError("existing variants", update, newStudy.getQueries(), newStudy.getIds());
                }
                // Add upserted documents
                newVariants += update.first().getUpserts().size();
            }
        } catch (MongoBulkWriteException e) {
            // Add upserted documents
            newVariants += e.getWriteResult().getUpserts().size();

            Set<String> duplicatedNonInsertedId = new HashSet<>();
            for (BulkWriteError writeError : e.getWriteErrors()) {
                if (!ErrorCategory.fromErrorCode(writeError.getCode()).equals(ErrorCategory.DUPLICATE_KEY)) {
                    throw e;
                } else {
                    Matcher matcher = DUP_KEY_WRITE_RESULT_ERROR_PATTERN.matcher(writeError.getMessage());
                    if (matcher.find()) {
                        String id = matcher.group(1);
                        duplicatedNonInsertedId.add(id);
                        logger.warn("Catch error : {}", writeError.toString());
                        logger.warn("DupKey exception inserting '{}'. Retry!", id);
                    } else {
                        logger.error("WriteError with code {} does not match with the pattern {}",
                                writeError.getCode(), DUP_KEY_WRITE_RESULT_ERROR_PATTERN.pattern());
                        throw e;
                    }
                }
            }

            if (retry) {
                // Retry once!
                // With UPSERT=true, this command should never throw DuplicatedKeyException.
                // See https://jira.mongodb.org/browse/SERVER-14322
                // Remove inserted variants
                logger.warn("Retry! " + e);
                // The three lists are parallel: walk them in lockstep and keep only the failed operations
                Iterator<String> iteratorId = newStudy.getIds().iterator();
                Iterator<?> iteratorQuery = newStudy.getQueries().iterator();
                Iterator<?> iteratorUpdate = newStudy.getUpdates().iterator();
                while (iteratorId.hasNext()) {
                    String id = iteratorId.next();
                    iteratorQuery.next();
                    iteratorUpdate.next();
                    if (!duplicatedNonInsertedId.contains(id)) {
                        iteratorId.remove();
                        iteratorQuery.remove();
                        iteratorUpdate.remove();
                    }
                }
                newVariants += executeMongoDBOperationsNewStudy(mongoDBOps, false);
            } else {
                throw e;
            }
        }
        return newVariants;
    }

    /**
     * Report an update whose result does not match the number of executed queries.
     *
     * Logs the update counters, executes again the queries to find which variants are missing,
     * and always fails with a {@link RuntimeException} listing the missing ids.
     * The given {@code queryIds} list is not modified.
     *
     * @param updateName Name of the failed update, for the log messages
     * @param update     Result of the failed update
     * @param queries    Queries of the failed update
     * @param queryIds   Variant ids expected to be found by the queries
     * @throws RuntimeException always
     */
    protected void onUpdateError(String updateName, QueryResult<BulkWriteResult> update, List<Bson> queries, List<String> queryIds) {
        logger.error("(Updated " + updateName + " variants = " + queries.size() + " ) != "
                + "(ModifiedCount = " + update.first().getModifiedCount() + "). MatchedCount:" + update.first().getMatchedCount());
        logger.info("QueryIDs: {}", queryIds);

        // Work on a copy, to avoid modifying the list owned by the caller
        List<String> missingIds = new ArrayList<>(queryIds);

        List<QueryResult<Document>> queryResults = variantsCollection.find(queries, null);
        logger.info("Results: {}", queryResults.size());
        for (QueryResult<Document> r : queryResults) {
            logger.info("result: {}", r);
            if (!r.getResult().isEmpty()) {
                String id = r.first().get("_id", String.class);
                boolean remove = missingIds.remove(id);
                logger.info("remove({}): {}", id, remove);
            }
        }

        StringBuilder sb = new StringBuilder("Missing Variant for update : ");
        for (String id : missingIds) {
            logger.error("Missing Variant " + id);
            sb.append(id).append(", ");
        }
        throw new RuntimeException(sb.toString());
    }

    /**
     * Notify the progress logger, if any.
     *
     * @param processedVariants Number of processed variants to add to the progress
     */
    protected void logProgress(long processedVariants) {
        if (progressLogger != null) {
            progressLogger.increment(processedVariants);
        }
    }

    /**
     * Create the indexes in the variants collection once all the data has been written.
     *
     * @return Always true
     */
    @Override
    public boolean post() {
        VariantMongoDBAdaptor.createIndexes(new QueryOptions(), variantsCollection);
        return true;
    }
}