/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.mongodb.variant.load.stage;
import com.google.common.collect.ListMultimap;
import com.mongodb.ErrorCategory;
import com.mongodb.MongoBulkWriteException;
import com.mongodb.bulk.BulkWriteError;
import com.mongodb.bulk.BulkWriteResult;
import org.bson.Document;
import org.bson.conversions.Bson;
import org.bson.types.Binary;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.commons.datastore.core.ComplexTypeConverter;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.mongodb.MongoDBCollection;
import org.opencb.commons.io.DataWriter;
import org.opencb.opencga.storage.mongodb.variant.converters.VariantStringIdConverter;
import org.opencb.opencga.storage.mongodb.variant.converters.stage.VariantToAvroBinaryConverter;
import org.opencb.opencga.storage.mongodb.variant.load.MongoDBVariantWriteResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static com.mongodb.client.model.Filters.*;
import static com.mongodb.client.model.Updates.*;
import static org.opencb.opencga.storage.mongodb.variant.converters.VariantStringIdConverter.*;
/**
 * {@link DataWriter} that upserts binary values into a "stage" collection.
 *
 * Every key of the incoming multimap is a {@link Document} whose {@code _id} selects the
 * stage document to update. The associated {@link Binary} values are appended to the array
 * field {@code <studyId>.<fileId>} of that document.
 *
 * Each call to {@link #insert(ListMultimap)} merges its own result into an accumulated
 * {@link MongoDBVariantWriteResult} (while holding the monitor of that object), available
 * through {@link #getWriteResult()}.
 *
 * Created on 07/04/16.
 *
 * @author Jacobo Coll <jacobo167@gmail.com>
 */
public class MongoDBVariantStageLoader implements DataWriter<ListMultimap<Document, Binary>> {

    /** Name of the per-study sub-field that the clean methods set to {@code false}. */
    public static final String NEW_STUDY_FIELD = "new";

    /** Not referenced in this class. Presumably the value assumed when {@link #NEW_STUDY_FIELD} is absent. */
    public static final boolean NEW_STUDY_DEFAULT = true;

    /** Options used for every stage write: update with {@code upsert} enabled. */
    private static final QueryOptions QUERY_OPTIONS = new QueryOptions(MongoDBCollection.UPSERT, true);

    /**
     * Extracts the duplicated {@code _id} (group 1) from the message of a duplicate key write error.
     *
     * Both of these message endings are accepted:
     * <pre>
     * dup key: { : "theId" }
     * dup key: { _id: "theId" }
     * </pre>
     * NOTE(review): the second form (key field name included) is the one expected from newer
     * MongoDB servers; confirm against the server versions in use.
     */
    public static final Pattern DUP_KEY_WRITE_RESULT_ERROR_PATTERN =
            Pattern.compile("^.*dup key: \\{ [^:\"]*: \"([^\"]*)\" \\}$");

    private static final Logger LOGGER = LoggerFactory.getLogger(MongoDBVariantStageLoader.class);

    /** Default Variant to Binary converter. Not referenced in this class. */
    public static final ComplexTypeConverter<Variant, Binary> VARIANT_CONVERTER_DEFAULT = new VariantToAvroBinaryConverter();

    /** Shared string id converter. Not referenced in this class. */
    public static final VariantStringIdConverter STRING_ID_CONVERTER = new VariantStringIdConverter();

    private final MongoDBCollection collection;
    /** Target array field inside each stage document: {@code <studyId>.<fileId>}. */
    private final String fieldName;
    /** When true, values are added with "addToSet" semantics instead of "push". */
    private final boolean resumeStageLoad;
    /** Accumulated result of every {@link #insert(ListMultimap)} call made on this instance. */
    private final MongoDBVariantWriteResult writeResult = new MongoDBVariantWriteResult();

    /**
     * @param collection      Stage collection to write into
     * @param studyId         Study id, first component of the target field name
     * @param fileId          File id, second component of the target field name
     * @param resumeStageLoad If true, values are added to the file array as a set (addToSet)
     *                        instead of being pushed unconditionally
     */
    public MongoDBVariantStageLoader(MongoDBCollection collection, int studyId, int fileId, boolean resumeStageLoad) {
        this.collection = collection;
        this.fieldName = fileField(studyId, fileId);
        this.resumeStageLoad = resumeStageLoad;
    }

    /**
     * Inserts every element of the batch, in order, using {@link #insert(ListMultimap)}.
     *
     * @param batch Maps to insert
     * @return Always {@code true}; failures are reported as exceptions thrown by {@code insert}
     */
    @Override
    public boolean write(List<ListMultimap<Document, Binary>> batch) {
        for (ListMultimap<Document, Binary> map : batch) {
            insert(map);
        }
        return true;
    }

    /**
     * Upserts the given values into the stage collection.
     *
     * Documents rejected with a duplicate key error on the first attempt are retried once.
     * The elapsed time and the counters of both attempts are stored in the returned result,
     * which is also merged into the accumulated result of this loader.
     *
     * @param ids Map from id document to the binary values to append
     * @return Result of this single insert operation
     * @throws MongoBulkWriteException if the write fails for any reason other than a
     *                                 duplicate key, or if the retry fails
     */
    public MongoDBVariantWriteResult insert(ListMultimap<Document, Binary> ids) {
        final long start = System.nanoTime();
        MongoDBVariantWriteResult result = new MongoDBVariantWriteResult();

        Set<String> retryKeys = updateMongo(ids, result, null);
        if (!retryKeys.isEmpty()) {
            updateMongo(ids, result, retryKeys);
        }

        result.setNewVariantsNanoTime(System.nanoTime() - start);

        synchronized (writeResult) {
            writeResult.merge(result);
        }
        return result;
    }

    /**
     * Given a map of id to binary values, inserts the binary objects in the stage collection.
     *
     * <pre>{@code
     * {
     *     <studyId> : {
     *         <fileId> : [ BinData(), BinData() ]
     *     }
     * }
     * }</pre>
     *
     * The field {@code <fileId>} is an array to detect duplicated variants within the same file.
     *
     * It may happen that an update with upsert:true fails if two different threads try to
     * update the same non existing document.
     * See https://jira.mongodb.org/browse/SERVER-14322
     *
     * In that case, the non inserted ids will be returned.
     *
     * The counters of this attempt are added to {@code result} (not overwritten), so the
     * counts of a first attempt are kept when a retry is executed with the same result object.
     *
     * @param values   Map with all the values to insert
     * @param result   MongoDBVariantWriteResult to fill
     * @param retryIds Set of IDs to retry. If not null, only the documents within this set are updated
     * @return Set of non updated document ids. Empty if everything was written.
     * @throws MongoBulkWriteException if the exception was not a DuplicatedKeyException (e:11000),
     *                                 or if it happens while retrying
     */
    private Set<String> updateMongo(ListMultimap<Document, Binary> values, MongoDBVariantWriteResult result, Set<String> retryIds) {
        if (values.isEmpty()) {
            return Collections.emptySet();
        }

        List<Bson> queries = new ArrayList<>();
        List<Bson> updates = new ArrayList<>();
        for (Document id : values.keySet()) {
            String variantId = id.getString("_id");
            if (retryIds != null && !retryIds.contains(variantId)) {
                // Retrying: skip the documents that were already written in the first attempt.
                continue;
            }
            queries.add(eq("_id", variantId));
            updates.add(combine(appendToFileField(values.get(id)),
                    setOnInsert(END_FIELD, id.get(END_FIELD)),
                    setOnInsert(REF_FIELD, id.get(REF_FIELD)),
                    setOnInsert(ALT_FIELD, id.get(ALT_FIELD))));
        }

        try {
            final BulkWriteResult mongoResult = collection.update(queries, updates, QUERY_OPTIONS).first();
            addCounts(result, mongoResult);
            return Collections.emptySet();
        } catch (MongoBulkWriteException e) {
            // Operations that did succeed before/aside the failing ones are still counted.
            addCounts(result, e.getWriteResult());
            if (retryIds != null) {
                // If retryIds != null, this was the second attempt to update. In this case, do fail.
                LOGGER.error("BulkWriteErrors when retrying the updates");
                throw e;
            }

            Set<String> nonInsertedIds = new HashSet<>();
            for (BulkWriteError writeError : e.getWriteErrors()) {
                if (ErrorCategory.fromErrorCode(writeError.getCode()) != ErrorCategory.DUPLICATE_KEY) {
                    // Only duplicate key errors are recoverable by retrying.
                    throw e;
                }
                Matcher matcher = DUP_KEY_WRITE_RESULT_ERROR_PATTERN.matcher(writeError.getMessage());
                if (!matcher.find()) {
                    LOGGER.error("WriteError with code {} does not match with the pattern {}",
                            writeError.getCode(), DUP_KEY_WRITE_RESULT_ERROR_PATTERN.pattern());
                    throw e;
                }
                String id = matcher.group(1);
                nonInsertedIds.add(id);
                LOGGER.warn("Catch error : {}", writeError.toString());
                LOGGER.warn("DupKey exception inserting '{}'. Retry!", id);
            }
            return nonInsertedIds;
        }
    }

    /**
     * Builds the update that appends the given values to the file array field.
     * Uses the single-value operators for exactly one value and the "each" variants otherwise;
     * set semantics are used when resuming a stage load, plain push otherwise.
     */
    private Bson appendToFileField(List<Binary> binaryList) {
        if (binaryList.size() == 1) {
            Binary binary = binaryList.get(0);
            return resumeStageLoad ? addToSet(fieldName, binary) : push(fieldName, binary);
        }
        return resumeStageLoad ? addEachToSet(fieldName, binaryList) : pushEach(fieldName, binaryList);
    }

    /**
     * Adds the counters of one bulk write attempt to {@code result}.
     *
     * Relies on {@code merge} accumulating the counters, which is how it is already used to
     * aggregate results in {@link #insert(ListMultimap)}.
     *
     * NOTE(review): the MongoDB driver reports documents created by upserting updates through
     * {@code getUpserts()}; confirm that {@code getInsertedCount()} is the intended counter.
     */
    private static void addCounts(MongoDBVariantWriteResult result, BulkWriteResult mongoResult) {
        MongoDBVariantWriteResult attempt = new MongoDBVariantWriteResult();
        attempt.setNewVariants(mongoResult.getInsertedCount())
                .setUpdatedVariants(mongoResult.getModifiedCount());
        result.merge(attempt);
    }

    /** Name of the field holding the values of one file of one study: {@code <studyId>.<fileId>}. */
    private static String fileField(int studyId, int fileId) {
        return studyId + "." + fileId;
    }

    /**
     * Cleans one file from the stage collection, in two multi-document updates:
     * <ol>
     * <li>Unsets the whole study from the documents where the file array has a second element
     *     and the study has no {@link #NEW_STUDY_FIELD} field.</li>
     * <li>For the documents where the file field exists, sets it to null and sets the
     *     {@link #NEW_STUDY_FIELD} field of the study to false.</li>
     * </ol>
     *
     * @param stageCollection Stage collection
     * @param studyId         Study id
     * @param fileId          File id to clean
     * @return Sum of the modified counts reported by both updates
     */
    public static long cleanStageCollection(MongoDBCollection stageCollection, int studyId, int fileId) {
        String fileField = fileField(studyId, fileId);
        String newStudyField = studyId + "." + NEW_STUDY_FIELD;

        //Delete those studies that have duplicated variants. Those are not inserted, so they are not new variants.
        long modifiedCount = stageCollection.update(
                and(exists(fileField + ".1"), exists(newStudyField, false)),
                unset(Integer.toString(studyId)),
                new QueryOptions(MongoDBCollection.MULTI, true)).first().getModifiedCount();

        // The file field is set to null instead of being unset.
        modifiedCount += stageCollection.update(
                exists(fileField),
                combine(
                        set(fileField, null),
                        set(newStudyField, false)
                ), new QueryOptions(MongoDBCollection.MULTI, true)).first().getModifiedCount();
        return modifiedCount;
    }

    /**
     * Cleans several files of a study from the stage collection, optionally restricted to
     * some chromosomes.
     *
     * If {@code result} is null or reports non inserted variants, the study is first unset from
     * the documents where the study has no {@link #NEW_STUDY_FIELD} field and every given file
     * is either empty or has more than one element. Then, for the documents containing any of
     * the file fields, all those file fields are set to null and the {@link #NEW_STUDY_FIELD}
     * field of the study is set to false.
     *
     * @param stageCollection Stage collection
     * @param studyId         Study id
     * @param fileIds         File ids to clean
     * @param chromosomes     Chromosomes to restrict the operation to. No restriction if null or empty
     * @param result          Write result of the load. May be null
     * @return Sum of the modified counts reported by the executed updates
     */
    public static long cleanStageCollection(MongoDBCollection stageCollection, int studyId, List<Integer> fileIds,
                                            Collection<String> chromosomes, MongoDBVariantWriteResult result) {
        boolean removeDuplicatedVariants = result == null || result.getNonInsertedVariants() > 0;
        String newStudyField = studyId + "." + NEW_STUDY_FIELD;

        Bson chrFilter;
        if (chromosomes != null && !chromosomes.isEmpty()) {
            List<Bson> chrFilters = new ArrayList<>();
            for (String chromosome : chromosomes) {
                MongoDBVariantStageReader.addChromosomeFilter(chrFilters, chromosome);
            }
            chrFilter = or(chrFilters);
        } else {
            // Empty document: matches everything.
            chrFilter = new Document();
        }

        long modifiedCount = 0;
        if (removeDuplicatedVariants) {
            // Delete those new studies that have duplicated variants. Those are not inserted, so they are not new variants.
            // i.e: For each file, either the file has not been loaded (empty), or the file has more than one element.
            // { $or : [ { <study>.<file>.0 : {$exists:false} }, { <study>.<file>.1 : {$exists:true} } ] }
            // TODO: This variants should be removed while loading data. This operation is taking too much time.
            List<Bson> duplicatedFilters = new ArrayList<>();
            duplicatedFilters.add(exists(newStudyField, false));
            for (Integer fileId : fileIds) {
                String fileField = fileField(studyId, fileId);
                duplicatedFilters.add(or(exists(fileField + ".0", false), exists(fileField + ".1")));
            }
            LOGGER.info("Clean studies from stage where all the files where duplicated");
            modifiedCount += stageCollection.update(
                    and(chrFilter, and(duplicatedFilters)), unset(Integer.toString(studyId)),
                    new QueryOptions(MongoDBCollection.MULTI, true)).first().getModifiedCount();
        }

        List<Bson> fileFilters = new ArrayList<>();
        List<Bson> updates = new ArrayList<>();
        for (Integer fileId : fileIds) {
            String fileField = fileField(studyId, fileId);
            fileFilters.add(exists(fileField));
            // The file field is set to null instead of being unset.
            updates.add(set(fileField, null));
        }
        updates.add(set(newStudyField, false));
        LOGGER.info("Cleaning files {} from stage collection", fileIds);
        modifiedCount += stageCollection.update(and(chrFilter, or(fileFilters)), combine(updates),
                new QueryOptions(MongoDBCollection.MULTI, true)).first().getModifiedCount();

        return modifiedCount;
    }

    /**
     * @return The accumulated write result of this loader. This is the internal instance, not a copy.
     */
    public MongoDBVariantWriteResult getWriteResult() {
        return writeResult;
    }
}