/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.mongodb.variant.adaptors;
import com.google.common.base.Throwables;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.mongodb.*;
import com.mongodb.bulk.BulkWriteError;
import com.mongodb.bulk.BulkWriteResult;
import com.mongodb.client.FindIterable;
import com.mongodb.client.model.Updates;
import com.mongodb.client.result.UpdateResult;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.lang3.time.StopWatch;
import org.bson.Document;
import org.bson.conversions.Bson;
import org.bson.json.JsonMode;
import org.bson.json.JsonWriterSettings;
import org.opencb.biodata.models.core.Region;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.AdditionalAttribute;
import org.opencb.biodata.models.variant.avro.VariantAnnotation;
import org.opencb.biodata.models.variant.avro.VariantType;
import org.opencb.biodata.models.variant.stats.VariantStats;
import org.opencb.cellbase.client.config.ClientConfiguration;
import org.opencb.cellbase.client.rest.CellBaseClient;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryResult;
import org.opencb.commons.datastore.mongodb.MongoDBCollection;
import org.opencb.commons.datastore.mongodb.MongoDataStore;
import org.opencb.commons.datastore.mongodb.MongoDataStoreManager;
import org.opencb.commons.io.DataWriter;
import org.opencb.opencga.core.results.VariantQueryResult;
import org.opencb.opencga.storage.core.cache.CacheManager;
import org.opencb.opencga.storage.core.config.CellBaseConfiguration;
import org.opencb.opencga.storage.core.config.StorageConfiguration;
import org.opencb.opencga.storage.core.config.StorageEngineConfiguration;
import org.opencb.opencga.storage.core.exceptions.VariantSearchException;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.metadata.StudyConfigurationManager;
import org.opencb.opencga.storage.core.search.VariantSearchManager;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
import org.opencb.opencga.storage.core.variant.adaptors.*;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptorUtils.*;
import org.opencb.opencga.storage.core.variant.annotation.VariantAnnotationManager;
import org.opencb.opencga.storage.core.variant.annotation.annotators.AbstractCellBaseVariantAnnotator;
import org.opencb.opencga.storage.core.variant.stats.VariantStatsWrapper;
import org.opencb.opencga.storage.mongodb.auth.MongoCredentials;
import org.opencb.opencga.storage.mongodb.variant.MongoDBVariantStorageEngine;
import org.opencb.opencga.storage.mongodb.variant.converters.*;
import org.opencb.opencga.storage.mongodb.variant.load.MongoDBVariantWriteResult;
import org.opencb.opencga.storage.mongodb.variant.load.stage.MongoDBVariantStageLoader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import static org.opencb.commons.datastore.mongodb.MongoDBCollection.MULTI;
import static org.opencb.commons.datastore.mongodb.MongoDBCollection.UPSERT;
import static org.opencb.opencga.storage.core.variant.VariantStorageEngine.Options.DEFAULT_TIMEOUT;
import static org.opencb.opencga.storage.core.variant.VariantStorageEngine.Options.MAX_TIMEOUT;
import static org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptorUtils.*;
import static org.opencb.opencga.storage.mongodb.variant.MongoDBVariantStorageEngine.MongoDBVariantOptions.COLLECTION_STAGE;
import static org.opencb.opencga.storage.mongodb.variant.MongoDBVariantStorageEngine.MongoDBVariantOptions.DEFAULT_GENOTYPE;
/**
* @author Ignacio Medina <igmecas@gmail.com>
* @author Jacobo Coll <jacobo167@gmail.com>
* @author Cristina Yenyxe Gonzalez Garcia <cyenyxe@ebi.ac.uk>
*/
public class VariantMongoDBAdaptor implements VariantDBAdaptor {
// Client used to reach CellBase for annotation-related queries.
private final CellBaseClient cellBaseClient;
// True only when this adaptor created its own MongoDataStoreManager and is responsible for it
// (see the constructor that builds the manager itself).
private boolean closeConnection;
private final MongoDataStoreManager mongoManager;
private final MongoDataStore db;
// Name of the main variants collection within the database.
private final String collectionName;
private final MongoDBCollection variantsCollection;
// Adaptor over the files collection holding variant source metadata.
private final VariantSourceMongoDBAdaptor variantSourceMongoDBAdaptor;
private final StorageConfiguration storageConfiguration;
@Deprecated
private final StorageEngineConfiguration storageEngineConfiguration;
// Extracts the duplicated key value from a MongoDB "dup key" write-error message.
private final Pattern writeResultErrorPattern = Pattern.compile("^.*dup key: \\{ : \"([^\"]*)\" \\}$");
private final VariantDBAdaptorUtils utils;
private final MongoCredentials credentials;
// Splits "<field><operator><value>" expressions, where the operator is a comparison such as
// =, ==, !=, <, <=, >, >=, ~, =~ or !~.
private static final Pattern OPERATION_PATTERN = Pattern.compile("^([^=<>~!]*)(<=?|>=?|!=|!?=?~|==?)([^=<>~!]+.*)$");
private StudyConfigurationManager studyConfigurationManager;
// Options taken from the storage-engine configuration; empty when none is configured.
private final ObjectMap configuration;
private final CellBaseConfiguration cellbaseConfiguration;
private CacheManager cacheManager;
private VariantSearchManager variantSearchManager;
@Deprecated
private DataWriter dataWriter;
protected static Logger logger = LoggerFactory.getLogger(VariantMongoDBAdaptor.class);
// Number of opened dbAdaptors
public static final AtomicInteger NUMBER_INSTANCES = new AtomicInteger(0);
/**
 * Creates an adaptor that opens its own MongoDB connection from the given credentials.
 * The {@code closeConnection} flag is set so this instance is marked as the owner of the
 * created {@link MongoDataStoreManager}.
 *
 * @param credentials MongoDB credentials (hosts, database name, auth)
 * @param variantsCollectionName name of the main variants collection
 * @param filesCollectionName name of the collection with variant source (file) metadata
 * @param studyConfigurationManager manager used to resolve study metadata
 * @param storageConfiguration global storage configuration
 * @throws UnknownHostException if a MongoDB host cannot be resolved
 */
public VariantMongoDBAdaptor(MongoCredentials credentials, String variantsCollectionName, String filesCollectionName,
StudyConfigurationManager studyConfigurationManager, StorageConfiguration storageConfiguration)
throws UnknownHostException {
this(new MongoDataStoreManager(credentials.getDataStoreServerAddresses()), credentials, variantsCollectionName, filesCollectionName,
studyConfigurationManager, storageConfiguration);
this.closeConnection = true;
}
/**
 * Creates an adaptor over an externally managed MongoDB connection.
 * The given {@code mongoManager} is NOT owned by this adaptor ({@code closeConnection = false}).
 *
 * @param mongoManager shared MongoDB connection manager
 * @param credentials MongoDB credentials (hosts, database name, auth)
 * @param variantsCollectionName name of the main variants collection
 * @param filesCollectionName name of the collection with variant source (file) metadata
 * @param studyConfigurationManager manager used to resolve study metadata
 * @param storageConfiguration global storage configuration (cellbase, search, engine options)
 * @throws UnknownHostException if a MongoDB host cannot be resolved
 */
public VariantMongoDBAdaptor(MongoDataStoreManager mongoManager, MongoCredentials credentials,
String variantsCollectionName, String filesCollectionName,
StudyConfigurationManager studyConfigurationManager, StorageConfiguration storageConfiguration)
throws UnknownHostException {
// MongoDB configuration
this.closeConnection = false;
this.credentials = credentials;
this.mongoManager = mongoManager;
db = mongoManager.get(credentials.getMongoDbName(), credentials.getMongoDBConfiguration());
variantSourceMongoDBAdaptor = new VariantSourceMongoDBAdaptor(db, filesCollectionName);
collectionName = variantsCollectionName;
variantsCollection = db.getCollection(collectionName);
this.studyConfigurationManager = studyConfigurationManager;
cellbaseConfiguration = storageConfiguration.getCellbase();
this.storageConfiguration = storageConfiguration;
this.storageEngineConfiguration = storageConfiguration.getStorageEngine(MongoDBVariantStorageEngine.STORAGE_ENGINE_ID);
// The engine section or its options may be missing; fall back to an empty map so later getters are safe.
this.configuration = storageEngineConfiguration == null || this.storageEngineConfiguration.getVariant().getOptions() == null
? new ObjectMap()
: this.storageEngineConfiguration.getVariant().getOptions();
this.utils = new VariantDBAdaptorUtils(this);
// Build a CellBase client using the configured species, or the client's default species if unset.
String species = configuration.getString(VariantAnnotationManager.SPECIES);
// NOTE(review): 'assembly' is read but never used in this constructor — confirm whether it
// should be passed to the CellBase client.
String assembly = configuration.getString(VariantAnnotationManager.ASSEMBLY);
ClientConfiguration clientConfiguration = cellbaseConfiguration.toClientConfiguration();
if (StringUtils.isEmpty(species)) {
species = clientConfiguration.getDefaultSpecies();
}
cellBaseClient = new CellBaseClient(AbstractCellBaseVariantAnnotator.toCellBaseSpeciesName(species), clientConfiguration);
this.cacheManager = new CacheManager(storageConfiguration);
this.variantSearchManager = new VariantSearchManager(utils, storageConfiguration);
// Keep track of how many adaptor instances are currently open.
NUMBER_INSTANCES.incrementAndGet();
}
/**
 * @return the MongoDB collection holding the variant documents
 */
public MongoDBCollection getVariantsCollection() {
return variantsCollection;
}
/**
 * @return the stage collection used during variant loading, resolved from configuration
 *         (falls back to the default collection name when not configured)
 */
public MongoDBCollection getStageCollection() {
    String stageCollectionName = configuration.getString(COLLECTION_STAGE.key(), COLLECTION_STAGE.defaultValue());
    return db.getCollection(stageCollectionName);
}
/**
 * @return the underlying MongoDB datastore (for subclass/package use)
 */
protected MongoDataStore getDB() {
return db;
}
/**
 * @return the MongoDB credentials this adaptor was created with (for subclass/package use)
 */
protected MongoCredentials getCredentials() {
return credentials;
}
/**
 * Stores the given legacy data writer.
 *
 * @param dataWriter writer to keep
 * @deprecated kept only for interface compatibility
 */
@Override
@Deprecated
public void setDataWriter(DataWriter dataWriter) {
this.dataWriter = dataWriter;
}
/**
 * Inserts the given variants into the study's documents.
 *
 * @param variants  variants to insert
 * @param studyName study the variants belong to
 * @param options   must carry FILE_ID; may carry INCLUDE_STATS
 * @return result of the insert operation
 */
@Override
public QueryResult insert(List<Variant> variants, String studyName, QueryOptions options) {
    StudyConfiguration studyConfiguration = studyConfigurationManager.getStudyConfiguration(studyName, options).first();
    // TODO FILE_ID must be in QueryOptions?
    int fileId = options.getInt(VariantStorageEngine.Options.FILE_ID.key());
    boolean includeStats = options.getBoolean(VariantStorageEngine.Options.INCLUDE_STATS.key(),
            VariantStorageEngine.Options.INCLUDE_STATS.defaultValue());
    // Only build a stats converter when stats should be written alongside the variants.
    DocumentToVariantStatsConverter statsConverter;
    if (includeStats) {
        statsConverter = new DocumentToVariantStatsConverter(studyConfigurationManager);
    } else {
        statsConverter = null;
    }
    DocumentToVariantConverter variantConverter = new DocumentToVariantConverter(null, statsConverter);
    DocumentToStudyVariantEntryConverter studyEntryConverter =
            new DocumentToStudyVariantEntryConverter(true, new DocumentToSamplesConverter(studyConfiguration));
    return insert(variants, fileId, variantConverter, studyEntryConverter, studyConfiguration,
            getLoadedSamples(fileId, studyConfiguration));
}
/**
 * Removes every variant document matching the query.
 *
 * @param query   filter selecting the variants to remove
 * @param options options forwarded to the remove operation
 * @return result of the remove operation
 */
@Override
public QueryResult delete(Query query, QueryOptions options) {
    Bson deleteFilter = parseQuery(query);
    logger.debug("Delete to be executed: '{}'", deleteFilter.toString());
    return variantsCollection.remove(deleteFilter, options);
}
/**
 * Removes the given samples from a study.
 *
 * @param studyName   study to modify
 * @param sampleNames samples to remove
 * @param options     operation options
 * @return never returns normally
 * @throws UnsupportedOperationException always; not implemented yet
 */
@Override
public QueryResult deleteSamples(String studyName, List<String> sampleNames, QueryOptions options) {
    //TODO
    // Fixed: throw with a message so callers get actionable diagnostics instead of a bare exception.
    throw new UnsupportedOperationException("VariantMongoDBAdaptor::deleteSamples is not implemented yet");
}
/**
 * Removes a loaded file from a study.
 *
 * @param studyName study to modify
 * @param fileName  file to remove
 * @param options   operation options
 * @return never returns normally
 * @throws UnsupportedOperationException always; not implemented yet
 */
@Override
public QueryResult deleteFile(String studyName, String fileName, QueryOptions options) {
    //TODO
    // Fixed: throw with a message so callers get actionable diagnostics instead of a bare exception.
    throw new UnsupportedOperationException("VariantMongoDBAdaptor::deleteFile is not implemented yet");
}
/**
 * Removes a whole study by pulling its entry from every variant's studies array.
 * With option {@code purge=true}, variants left with an empty studies array are removed entirely.
 *
 * @param studyName study name (resolved through the StudyConfigurationManager)
 * @param options   may contain "purge" (boolean, default false); null allowed
 * @return result of the multi-document pull update
 */
@Override
public QueryResult deleteStudy(String studyName, QueryOptions options) {
if (options == null) {
options = new QueryOptions();
}
StudyConfiguration studyConfiguration = studyConfigurationManager.getStudyConfiguration(studyName, options).first();
// Match only the variants that contain this study.
Document query = parseQuery(new Query(VariantQueryParams.STUDIES.key(), studyConfiguration.getStudyId()));
// { $pull : { files : { sid : <studyId> } } }
Document update = new Document(
"$pull",
new Document(
DocumentToVariantConverter.STUDIES_FIELD,
new Document(
DocumentToStudyVariantEntryConverter.STUDYID_FIELD, studyConfiguration.getStudyId()
)
)
);
QueryResult<UpdateResult> result = variantsCollection.update(query, update, new QueryOptions(MULTI, true));
logger.debug("deleteStudy: query = {}", query);
logger.debug("deleteStudy: update = {}", update);
if (options.getBoolean("purge", false)) {
// Remove variants that no longer belong to any study.
Document purgeQuery = new Document(DocumentToVariantConverter.STUDIES_FIELD, new Document("$size", 0));
variantsCollection.remove(purgeQuery, new QueryOptions(MULTI, true));
}
return result;
}
/**
 * Fetches the variants matching the query.
 * Resolution order:
 * <ol>
 * <li>option "cache" set and caching of "var" allowed: try the cache first (keyed on the first study id),
 *     querying MongoDB and populating the cache on a miss;</li>
 * <li>option "summary" set and the search index active and alive: query the search engine;</li>
 * <li>otherwise query MongoDB directly.</li>
 * </ol>
 *
 * @param query   variant query
 * @param options query options; null allowed
 * @return matching variants with samples metadata
 */
@Override
public VariantQueryResult<Variant> get(Query query, QueryOptions options) {
    if (options == null) {
        options = new QueryOptions();
    }
    // Fixed: this was an INFO-level "********" banner built with string concatenation, left over
    // from debugging. Use parameterized DEBUG logging instead.
    logger.debug("Summary => {}", options.getBoolean("summary"));
    VariantQueryResult<Variant> queryResult;
    if (options.getBoolean("cache") && cacheManager.isTypeAllowed("var")) {
        List<Integer> studyIds = utils.getStudyIds(query.getAsList(VariantQueryParams.STUDIES.key()), options);
        // TODO : ONLY USING ONE STUDY ID ?
        String key = cacheManager.createKey(studyIds.get(0).toString(), "var", query, options);
        queryResult = new VariantQueryResult<>(cacheManager.get(key), null);
        if (queryResult.getResult() == null || queryResult.getResult().size() == 0) {
            // Cache miss: query the database and store the result for next time.
            queryResult = getVariantQueryResult(query, options);
            cacheManager.set(key, query, queryResult);
        }
    } else {
        if (options.getBoolean("summary", false) && storageConfiguration.getSearch().getActive()
                && variantSearchManager != null && variantSearchManager.isAlive(credentials.getMongoDbName())) {
            try {
                queryResult = variantSearchManager.query(credentials.getMongoDbName(), query, options);
            } catch (IOException | VariantSearchException e) {
                throw Throwables.propagate(e);
            }
        } else {
            queryResult = getVariantQueryResult(query, options);
        }
    }
    return queryResult;
}
/**
 * Executes the query directly against the variants collection (no cache, no search index).
 * NOTE: mutates the given options — SKIP_COUNT is defaulted to true and TIMEOUT is clamped
 * to the configured maximum.
 *
 * @param query   variant query, translated into a MongoDB filter
 * @param options query options (projection, limit, timeout, ...)
 * @return matching variants together with the per-study samples metadata
 */
private VariantQueryResult<Variant> getVariantQueryResult(Query query, QueryOptions options) {
Document mongoQuery = parseQuery(query);
Document projection = createProjection(query, options);
// logger.debug("Query to be executed: '{}'", mongoQuery.toJson(new JsonWriterSettings(JsonMode.SHELL, false)));
options.putIfAbsent(QueryOptions.SKIP_COUNT, true);
// Clamp the requested timeout: values below 0 or above the configured maximum use the maximum.
int defaultTimeout = configuration.getInt(DEFAULT_TIMEOUT.key(), DEFAULT_TIMEOUT.defaultValue());
int maxTimeout = configuration.getInt(MAX_TIMEOUT.key(), MAX_TIMEOUT.defaultValue());
int timeout = options.getInt(QueryOptions.TIMEOUT, defaultTimeout);
if (timeout > maxTimeout || timeout < 0) {
timeout = maxTimeout;
}
options.put(QueryOptions.TIMEOUT, timeout);
// FIXME: MONGO_MIGRATION
// if (options.getBoolean("mongodb.explain", false)) {
// FindIterable<Document> dbCursor = variantsCollection.nativeQuery().find(mongoQuery, projection, options);
// DBObject explain = dbCursor.explain();
// try {
// System.err.println("mongodb.explain = "
// + new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(explain));
// } catch (JsonProcessingException ignore) {
// System.err.println("mongodb.explain = " + explain);
// }
// }
DocumentToVariantConverter converter = getDocumentToVariantConverter(query, options);
Map<String, List<String>> samples = getDBAdaptorUtils().getSamplesMetadata(query, options);
return new VariantQueryResult<>(variantsCollection.find(mongoQuery, projection, converter, options), samples);
}
/**
 * Runs each query independently with the same options, preserving input order.
 *
 * @param queries queries to execute
 * @param options options applied to every query
 * @return one result per input query, in the same order
 */
@Override
public List<VariantQueryResult<Variant>> get(List<Query> queries, QueryOptions options) {
    return queries.stream()
            .map(singleQuery -> get(singleQuery, options))
            .collect(Collectors.toList());
}
/**
 * Returns the variants phased with the given variant for one sample.
 * The variant is fetched first to read the sample's phase set (the PS FORMAT field). When a
 * phase set is present, a window of +/- windowsSize around the variant is queried and only the
 * variants sharing the same phase set value for that sample are kept.
 *
 * @param varStr      variant acting as the phase-set anchor, e.g. "1:1000:A:T"
 * @param studyName   study to search in
 * @param sampleName  sample whose phasing is inspected
 * @param options     options used when building the samples metadata of the result
 * @param windowsSize half-width, in bases, of the window scanned around the variant
 * @return phased variants, or an empty result when the variant is missing or not phased
 */
@Override
public VariantQueryResult<Variant> getPhased(String varStr, String studyName, String sampleName, QueryOptions options,
int windowsSize) {
StopWatch watch = new StopWatch();
watch.start();
Variant variant = new Variant(varStr);
Region region = new Region(variant.getChromosome(), variant.getStart(), variant.getEnd());
// Fetch the anchor variant itself, restricted to the requested study and sample.
Query query = new Query(VariantQueryParams.REGION.key(), region)
.append(VariantQueryParams.REFERENCE.key(), variant.getReference())
.append(VariantQueryParams.ALTERNATE.key(), variant.getAlternate())
.append(VariantQueryParams.STUDIES.key(), studyName)
.append(VariantQueryParams.RETURNED_STUDIES.key(), studyName)
.append(VariantQueryParams.RETURNED_SAMPLES.key(), sampleName);
VariantQueryResult<Variant> queryResult = get(query, new QueryOptions());
variant = queryResult.first();
if (variant != null && !variant.getStudies().isEmpty()) {
StudyEntry studyEntry = variant.getStudies().get(0);
// Position of the PS (phase set) field within the FORMAT, if declared.
Integer psIdx = studyEntry.getFormatPositions().get(VCFConstants.PHASE_SET_KEY);
if (psIdx != null) {
String ps = studyEntry.getSamplesData().get(0).get(psIdx);
if (!ps.equals(DocumentToSamplesConverter.UNKNOWN_FIELD)) {
sampleName = studyEntry.getOrderedSamplesName().get(0);
// Widen the region by windowsSize on each side (clamped at 0) and drop the allele and
// returned-* filters so neighbouring variants are included in the second query.
region.setStart(region.getStart() > windowsSize ? region.getStart() - windowsSize : 0);
region.setEnd(region.getEnd() + windowsSize);
query.remove(VariantQueryParams.REFERENCE.key());
query.remove(VariantQueryParams.ALTERNATE.key());
query.remove(VariantQueryParams.RETURNED_STUDIES.key());
query.remove(VariantQueryParams.RETURNED_SAMPLES.key());
queryResult = get(query, new QueryOptions(QueryOptions.SORT, true));
// Keep only the variants whose PS for this sample matches the anchor's phase set.
Iterator<Variant> iterator = queryResult.getResult().iterator();
while (iterator.hasNext()) {
Variant next = iterator.next();
if (!next.getStudies().isEmpty()) {
if (!ps.equals(next.getStudies().get(0).getSampleData(sampleName, VCFConstants.PHASE_SET_KEY))) {
iterator.remove();
}
}
}
queryResult.setNumResults(queryResult.getResult().size());
queryResult.setNumTotalResults(queryResult.getResult().size());
watch.stop();
queryResult.setDbTime(((int) watch.getTime()));
queryResult.setId("getPhased");
queryResult.setSamples(getDBAdaptorUtils().getSamplesMetadata(query, options));
return queryResult;
}
}
}
// No phase set available (or variant not found): return an empty result.
watch.stop();
return new VariantQueryResult<>("getPhased", ((int) watch.getTime()), 0, 0, null, null, Collections.emptyList(), null);
}
/**
 * Counts the variants matching the query.
 *
 * @param query filter to count against
 * @return number of matching variant documents
 */
@Override
public QueryResult<Long> count(Query query) {
    return variantsCollection.count(parseQuery(query));
}
/**
 * Returns the distinct values of an annotation field over the matching variants.
 * Supported aliases: "gene"/"ensemblGene" (Ensembl gene id), "ensemblTranscript"
 * (Ensembl transcript id), "ct"/"consequence_type" (SO accession); any other value
 * falls back to the gene name.
 *
 * @param query filter selecting the variants
 * @param field field alias to collect distinct values for
 * @return distinct values of the resolved annotation field
 */
@Override
public QueryResult distinct(Query query, String field) {
    // Resolve only the leaf field; all aliases live under annotation.consequence_type.
    String leafField;
    switch (field) {
        case "gene":
        case "ensemblGene":
            leafField = DocumentToVariantAnnotationConverter.CT_ENSEMBL_GENE_ID_FIELD;
            break;
        case "ensemblTranscript":
            leafField = DocumentToVariantAnnotationConverter.CT_ENSEMBL_TRANSCRIPT_ID_FIELD;
            break;
        case "ct":
        case "consequence_type":
            leafField = DocumentToVariantAnnotationConverter.CT_SO_ACCESSION_FIELD;
            break;
        default:
            leafField = DocumentToVariantAnnotationConverter.CT_GENE_NAME_FIELD;
            break;
    }
    String documentPath = DocumentToVariantConverter.ANNOTATION_FIELD
            + "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
            + "." + leafField;
    return variantsCollection.distinct(documentPath, parseQuery(query));
}
/**
 * Iterates over every variant, with an empty query and default options.
 *
 * @return iterator over all variants
 */
@Override
public VariantDBIterator iterator() {
return iterator(new Query(), new QueryOptions());
}
/**
 * Returns an iterator over the variants matching the query.
 * With option "summary" (and an active, alive search index) the iteration is served by the
 * search engine; otherwise by MongoDB, using a plain cursor for short queries (with timeout
 * or limit, or unsorted) and a persistent cursor for long sorted scans.
 *
 * @param query   variant query; null means no filter
 * @param options query options; null means defaults
 * @return iterator over the matching variants
 */
@Override
public VariantDBIterator iterator(Query query, QueryOptions options) {
    if (options == null) {
        options = new QueryOptions();
    }
    if (query == null) {
        query = new Query();
    }
    if (options.getBoolean("summary", false) && storageConfiguration.getSearch().getActive()
            && variantSearchManager != null && variantSearchManager.isAlive(credentials.getMongoDbName())) {
        // Solr iterator
        try {
            return variantSearchManager.iterator(credentials.getMongoDbName(), query, options);
        } catch (VariantSearchException | IOException e) {
            // Fixed: the exception was swallowed with printStackTrace() and the method then fell
            // through to "return null", guaranteeing a NullPointerException in the caller.
            // Propagate instead, matching the error handling of get(Query, QueryOptions).
            throw Throwables.propagate(e);
        }
    } else {
        Document mongoQuery = parseQuery(query);
        Document projection = createProjection(query, options);
        DocumentToVariantConverter converter = getDocumentToVariantConverter(query, options);
        options.putIfAbsent(MongoDBCollection.BATCH_SIZE, 100);
        // Short unsorted queries with timeout or limit don't need the persistent cursor.
        if (options.containsKey(QueryOptions.TIMEOUT)
                || options.containsKey(QueryOptions.LIMIT)
                || !options.containsKey(QueryOptions.SORT)) {
            FindIterable<Document> dbCursor = variantsCollection.nativeQuery().find(mongoQuery, projection, options);
            return new VariantMongoDBIterator(dbCursor, converter);
        } else {
            return VariantMongoDBIterator.persistentIterator(variantsCollection, mongoQuery, projection, options, converter);
        }
    }
}
/**
 * Computes a per-interval histogram of variant counts over the given region.
 * A MongoDB aggregation matches the variants in the region (plus any extra query filters),
 * groups them by interval index (integer division of the start position by the interval size)
 * and sorts by interval. Intervals without variants are filled in with features_count = 0.
 *
 * @param query              extra filters applied inside the region
 * @param region             region to partition into intervals
 * @param regionIntervalSize interval width in bases; if <= 0, chosen so ~200 intervals cover the region
 * @return one Document per interval with _id, start, end, chromosome and features_count
 */
@Override
public QueryResult getFrequency(Query query, Region region, int regionIntervalSize) {
// db.variants.aggregate( { $match: { $and: [ {chr: "1"}, {start: {$gt: 251391, $lt: 2701391}} ] }},
// { $group: { _id: { $subtract: [ { $divide: ["$start", 20000] }, { $divide: [{$mod: ["$start", 20000]},
// 20000] } ] },
// totalCount: {$sum: 1}}})
QueryOptions options = new QueryOptions();
// If interval is not provided is set to the value that returns 200 values
if (regionIntervalSize <= 0) {
// regionIntervalSize = options.getInt("interval", (region.getEnd() - region.getStart()) / 200);
regionIntervalSize = (region.getEnd() - region.getStart()) / 200;
}
// Region filter: same chromosome and start strictly between the region boundaries.
Document start = new Document("$gt", region.getStart());
start.append("$lt", region.getEnd());
BasicDBList andArr = new BasicDBList();
andArr.add(new Document(DocumentToVariantConverter.CHROMOSOME_FIELD, region.getChromosome()));
andArr.add(new Document(DocumentToVariantConverter.START_FIELD, start));
// Parsing the rest of options
Document mongoQuery = parseQuery(query);
if (!mongoQuery.isEmpty()) {
andArr.add(mongoQuery);
}
Document match = new Document("$match", new Document("$and", andArr));
// qb.and("_at.chunkIds").in(chunkIds);
// qb.and(DBObjectToVariantConverter.END_FIELD).greaterThanEquals(region.getStart());
// qb.and(DBObjectToVariantConverter.START_FIELD).lessThanEquals(region.getEnd());
//
// List<String> chunkIds = getChunkIds(region);
// DBObject regionObject = new Document("_at.chunkIds", new Document("$in", chunkIds))
// .append(DBObjectToVariantConverter.END_FIELD, new Document("$gte", region.getStart()))
// .append(DBObjectToVariantConverter.START_FIELD, new Document("$lte", region.getEnd()));
// Interval index = start/size - (start%size)/size, i.e. integer division expressed with $divide/$mod.
BasicDBList divide1 = new BasicDBList();
divide1.add("$start");
divide1.add(regionIntervalSize);
BasicDBList divide2 = new BasicDBList();
divide2.add(new Document("$mod", divide1));
divide2.add(regionIntervalSize);
BasicDBList subtractList = new BasicDBList();
subtractList.add(new Document("$divide", divide1));
subtractList.add(new Document("$divide", divide2));
Document subtract = new Document("$subtract", subtractList);
Document totalCount = new Document("$sum", 1);
Document g = new Document("_id", subtract);
g.append("features_count", totalCount);
Document group = new Document("$group", g);
Document sort = new Document("$sort", new Document("_id", 1));
// logger.info("getAllIntervalFrequencies - (>·_·)>");
// System.out.println(options.toString());
// System.out.println(match.toString());
// System.out.println(group.toString());
// System.out.println(sort.toString());
long dbTimeStart = System.currentTimeMillis();
QueryResult output = variantsCollection.aggregate(/*"$histogram", */Arrays.asList(match, group, sort), options);
long dbTimeEnd = System.currentTimeMillis();
Map<Long, Document> ids = new HashMap<>();
// Create DBObject for intervals with features inside them
for (Document intervalObj : (List<Document>) output.getResult()) {
Long auxId = Math.round((Double) intervalObj.get("_id")); //is double
Document intervalVisited = ids.get(auxId);
if (intervalVisited == null) {
// First hit for this interval: decorate it with its boundaries and a log-scaled count.
intervalObj.put("_id", auxId);
intervalObj.put("start", getChunkStart(auxId.intValue(), regionIntervalSize));
intervalObj.put("end", getChunkEnd(auxId.intValue(), regionIntervalSize));
intervalObj.put("chromosome", region.getChromosome());
intervalObj.put("features_count", Math.log((int) intervalObj.get("features_count")));
ids.put(auxId, intervalObj);
} else {
// NOTE(review): "features_count" mixes types here — the first visit stores a Double
// (Math.log), while this branch stores an Integer (sum.intValue()), so a third visit on
// the same id would fail the (Double) cast. Summing logarithms also looks suspicious;
// confirm whether a plain count was intended.
Double sum = (Double) intervalVisited.get("features_count") + Math.log((int) intervalObj.get("features_count"));
intervalVisited.put("features_count", sum.intValue());
}
}
// Create DBObject for intervals without features inside them
BasicDBList resultList = new BasicDBList();
int firstChunkId = getChunkId(region.getStart(), regionIntervalSize);
int lastChunkId = getChunkId(region.getEnd(), regionIntervalSize);
Document intervalObj;
for (int chunkId = firstChunkId; chunkId <= lastChunkId; chunkId++) {
intervalObj = ids.get((long) chunkId);
if (intervalObj == null) {
intervalObj = new Document();
intervalObj.put("_id", chunkId);
intervalObj.put("start", getChunkStart(chunkId, regionIntervalSize));
intervalObj.put("end", getChunkEnd(chunkId, regionIntervalSize));
intervalObj.put("chromosome", region.getChromosome());
intervalObj.put("features_count", 0);
}
resultList.add(intervalObj);
}
QueryResult queryResult = new QueryResult(region.toString(), ((Long) (dbTimeEnd - dbTimeStart)).intValue(),
resultList.size(), resultList.size(), null, null, resultList);
return queryResult;
}
/**
 * Ranks the groups produced by {@link #groupBy(Query, String, QueryOptions)}, limited to the
 * top {@code numResults} entries.
 *
 * @param query      filter selecting the variants
 * @param field      field alias to group by
 * @param numResults maximum number of groups to return
 * @param asc        true for ascending order, false for descending
 * @return grouped and counted results, sorted as requested
 */
@Override
public QueryResult rank(Query query, String field, int numResults, boolean asc) {
    // MongoDB sort order: 1 = ascending, -1 = descending.
    int order;
    if (asc) {
        order = 1;
    } else {
        order = -1;
    }
    QueryOptions rankOptions = new QueryOptions();
    rankOptions.put("limit", numResults);
    rankOptions.put("count", true);
    rankOptions.put("order", order);
    return groupBy(query, field, rankOptions);
}
/**
 * Groups the matching variants by an annotation field using a MongoDB aggregation pipeline:
 * match -> project the field -> unwind -> drop nulls -> group collecting variant ids ->
 * project counts -> sort -> optional skip -> limit (default 10).
 * Supported aliases: "gene"/"ensemblGene" (Ensembl gene id), "ct"/"consequence_type"
 * (SO accession); any other value groups by gene name.
 * With option "count" only the group sizes are returned; otherwise the grouped ids are included.
 *
 * @param query   filters applied before grouping
 * @param field   field alias to group by
 * @param options may contain "count", "order", SKIP and LIMIT; null allowed
 * @return aggregation result, one document per group
 */
@Override
public QueryResult groupBy(Query query, String field, QueryOptions options) {
String documentPath;
String unwindPath;
// Number of $unwind stages needed to flatten the projected field down to scalars.
int numUnwinds = 2;
switch (field) {
case "gene":
case "ensemblGene":
documentPath = DocumentToVariantConverter.ANNOTATION_FIELD
+ "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
+ "." + DocumentToVariantAnnotationConverter.CT_ENSEMBL_GENE_ID_FIELD;
unwindPath = DocumentToVariantConverter.ANNOTATION_FIELD
+ "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD;
break;
case "ct":
case "consequence_type":
documentPath = DocumentToVariantConverter.ANNOTATION_FIELD
+ "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
+ "." + DocumentToVariantAnnotationConverter.CT_SO_ACCESSION_FIELD;
unwindPath = DocumentToVariantConverter.ANNOTATION_FIELD
+ "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD;
// SO accessions are nested one array deeper, so an extra $unwind is required.
numUnwinds = 3;
break;
default:
documentPath = DocumentToVariantConverter.ANNOTATION_FIELD
+ "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
+ "." + DocumentToVariantAnnotationConverter.CT_GENE_NAME_FIELD;
unwindPath = DocumentToVariantConverter.ANNOTATION_FIELD
+ "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD;
break;
}
Document mongoQuery = parseQuery(query);
if (options == null) {
options = new QueryOptions();
} else {
options = new QueryOptions(options); // Copy given QueryOptions.
}
boolean count = options.getBoolean("count", false);
int order = options.getInt("order", -1);
Document project;
Document projectAndCount;
if (count) {
// Count-only mode: project just the grouping field, later report only group sizes.
project = new Document("$project", new Document("field", "$" + documentPath));
projectAndCount = new Document("$project", new Document()
.append("id", "$_id")
.append("_id", 0)
.append("count", new Document("$size", "$values")));
} else {
// Full mode: keep the variant coordinates/ids so each group lists its members.
project = new Document("$project", new Document()
.append("field", "$" + documentPath)
//.append("_id._id", "$_id")
.append("_id.start", "$" + DocumentToVariantConverter.START_FIELD)
.append("_id.end", "$" + DocumentToVariantConverter.END_FIELD)
.append("_id.chromosome", "$" + DocumentToVariantConverter.CHROMOSOME_FIELD)
.append("_id.alternate", "$" + DocumentToVariantConverter.ALTERNATE_FIELD)
.append("_id.reference", "$" + DocumentToVariantConverter.REFERENCE_FIELD)
.append("_id.ids", "$" + DocumentToVariantConverter.IDS_FIELD));
projectAndCount = new Document("$project", new Document()
.append("id", "$_id")
.append("_id", 0)
.append("values", "$values")
.append("count", new Document("$size", "$values")));
}
Document match = new Document("$match", mongoQuery);
Document unwindField = new Document("$unwind", "$field");
// Drop documents where the grouping field is absent.
Document notNull = new Document("$match", new Document("field", new Document("$ne", null)));
Document groupAndAddToSet = new Document("$group", new Document("_id", "$field")
.append("values", new Document("$addToSet", "$_id"))); // sum, count, avg, ...?
Document sort = new Document("$sort", new Document("count", order)); // 1 = ascending, -1 = descending
int skip = options.getInt(QueryOptions.SKIP, -1);
Document skipStep = skip > 0 ? new Document("$skip", skip) : null;
int limit = options.getInt(QueryOptions.LIMIT, -1) > 0 ? options.getInt(QueryOptions.LIMIT) : 10;
options.remove(QueryOptions.LIMIT); // Remove limit or Datastore will add a new limit step
Document limitStep = new Document("$limit", limit);
// Assemble the pipeline in execution order.
List<Bson> operations = new LinkedList<>();
operations.add(match);
operations.add(project);
for (int i = 0; i < numUnwinds; i++) {
operations.add(unwindField);
}
operations.add(notNull);
operations.add(groupAndAddToSet);
operations.add(projectAndCount);
operations.add(sort);
if (skipStep != null) {
operations.add(skipStep);
}
operations.add(limitStep);
logger.debug("db." + collectionName + ".aggregate( " + operations + " )");
QueryResult<Document> queryResult = variantsCollection.aggregate(operations, options);
// List<Map<String, Object>> results = new ArrayList<>(queryResult.getResult().size());
// results.addAll(queryResult.getResult().stream().map(dbObject -> new ObjectMap("id", dbObject.get("_id")).append("count",
// dbObject.get("count"))).collect(Collectors.toList()));
return queryResult;
}
/**
 * Multi-field grouping is not implemented; falls back to grouping by the first field and
 * attaches a warning message to the result.
 *
 * @param query   filters applied before grouping
 * @param fields  requested grouping fields; only fields.get(0) is used
 * @param options grouping options
 * @return result of grouping by the first field, carrying a warning
 */
@Override
public QueryResult groupBy(Query query, List<String> fields, QueryOptions options) {
    String field = fields.get(0);
    String warningMsg = "Unimplemented VariantMongoDBAdaptor::groupBy list of fields. Using field[0] : '" + field + "'";
    logger.warn(warningMsg);
    QueryResult result = groupBy(query, field, options);
    result.setWarningMsg(warningMsg);
    return result;
}
/**
 * Resolves the study by name and delegates to
 * {@link #updateStats(List, StudyConfiguration, QueryOptions)}.
 *
 * @param variantStatsWrappers stats per variant, grouped by cohort
 * @param studyName            study whose stats are being written
 * @param options              operation options
 * @return result of the stats update
 */
@Override
public QueryResult updateStats(List<VariantStatsWrapper> variantStatsWrappers, String studyName, QueryOptions options) {
    StudyConfiguration studyConfiguration = studyConfigurationManager.getStudyConfiguration(studyName, options).first();
    return updateStats(variantStatsWrappers, studyConfiguration, options);
}
/**
 * Loads calculated variant stats into the variants collection.
 * For each variant, one document per cohort is pushed onto the stats array; when
 * OVERWRITE_STATS is set, previously stored entries for the same cohort and study are
 * pulled first so the new values replace them.
 *
 * @param variantStatsWrappers stats per variant, grouped by cohort
 * @param studyConfiguration   study whose stats are being written
 * @param options              may contain OVERWRITE_STATS (boolean, default false)
 * @return a QueryResult wrapping the BulkWriteResult of the push phase
 */
@Override
public QueryResult updateStats(List<VariantStatsWrapper> variantStatsWrappers, StudyConfiguration studyConfiguration,
QueryOptions options) {
    List<Bson> pullQueriesBulkList = new LinkedList<>();
    List<Bson> pullUpdatesBulkList = new LinkedList<>();
    List<Bson> pushQueriesBulkList = new LinkedList<>();
    List<Bson> pushUpdatesBulkList = new LinkedList<>();
    long start = System.nanoTime();
    DocumentToVariantStatsConverter statsConverter = new DocumentToVariantStatsConverter(studyConfigurationManager);
    DocumentToVariantConverter variantConverter = getDocumentToVariantConverter(new Query(), options);
    boolean overwrite = options.getBoolean(VariantStorageEngine.Options.OVERWRITE_STATS.key(), false);
    //TODO: Use the StudyConfiguration to change names to ids
    // TODO make unset of 'st' if already present?
    for (VariantStatsWrapper wrapper : variantStatsWrappers) {
        Map<String, VariantStats> cohortStats = wrapper.getCohortStats();
        // Any cohort's stats carry the ref/alt alleles needed to rebuild the variant id.
        Iterator<VariantStats> iterator = cohortStats.values().iterator();
        VariantStats variantStats = iterator.hasNext() ? iterator.next() : null;
        List<Document> cohorts = statsConverter.convertCohortsToStorageType(cohortStats, studyConfiguration.getStudyId());
        // Overwrite old values for the same cohort and study: pull them first, then push the new ones.
        // db.variants.update({_id:<id>}, {$pull:{st:{$or:[{cid:..., sid:...}, ...]}}})
        // db.variants.update({_id:<id>}, {$push:{st:{$each: [{cid:..., sid:..., ...}, ...]}}})
        if (!cohorts.isEmpty()) {
            String id = variantConverter.buildStorageId(wrapper.getChromosome(), wrapper.getPosition(),
                    variantStats.getRefAllele(), variantStats.getAltAllele());
            Document find = new Document("_id", id);
            if (overwrite) {
                List<Document> idsList = new ArrayList<>(cohorts.size());
                for (Document cohort : cohorts) {
                    Document ids = new Document()
                            .append(DocumentToVariantStatsConverter.COHORT_ID, cohort.get(DocumentToVariantStatsConverter.COHORT_ID))
                            .append(DocumentToVariantStatsConverter.STUDY_ID, cohort.get(DocumentToVariantStatsConverter.STUDY_ID));
                    idsList.add(ids);
                }
                Document pull = new Document("$pull",
                        new Document(DocumentToVariantConverter.STATS_FIELD,
                                new Document("$or", idsList)));
                pullQueriesBulkList.add(find);
                pullUpdatesBulkList.add(pull);
            }
            Document push = new Document("$push",
                    new Document(DocumentToVariantConverter.STATS_FIELD,
                            new Document("$each", cohorts)));
            pushQueriesBulkList.add(find);
            pushUpdatesBulkList.add(push);
        }
    }
    // TODO handle if the variant didn't had that studyId in the files array
    // TODO check the substitution is done right if the stats are already present
    if (overwrite) {
        variantsCollection.update(pullQueriesBulkList, pullUpdatesBulkList, new QueryOptions());
    }
    BulkWriteResult writeResult = variantsCollection.update(pushQueriesBulkList, pushUpdatesBulkList, new QueryOptions()).first();
    int writes = writeResult.getModifiedCount();
    // Fixed: the elapsed time was reported as raw nanoseconds cast to int, which both used
    // the wrong unit and overflowed after ~2.1 seconds. Report milliseconds instead.
    int dbTimeMillis = (int) ((System.nanoTime() - start) / 1000000);
    return new QueryResult<>("", dbTimeMillis, writes, writes, "", "", Collections.singletonList(writeResult));
}
@Override
public QueryResult deleteStats(String studyName, String cohortName, QueryOptions options) {
    // Resolve study and cohort identifiers. NOTE(review): a missing cohort name yields an
    // unboxing NPE on the get() result — presumably callers always pass a known cohort.
    StudyConfiguration studyConfiguration = studyConfigurationManager.getStudyConfiguration(studyName, options).first();
    int cohortId = studyConfiguration.getCohortIds().get(cohortName);
    int studyId = studyConfiguration.getStudyId();
    // Stats entry identified by this study + cohort pair: { sid : <studyId>, cid : <cohortId> }
    Document statsEntry = new Document(DocumentToVariantStatsConverter.STUDY_ID, studyId)
            .append(DocumentToVariantStatsConverter.COHORT_ID, cohortId);
    // Select variants holding such an entry: { st : { $elemMatch : { sid : <studyId>, cid : <cohortId> } } }
    Document query = new Document(DocumentToVariantConverter.STATS_FIELD, new Document("$elemMatch", statsEntry));
    // Remove the entry from the stats array: { $pull : { st : { sid : <studyId>, cid : <cohortId> } } }
    Document update = new Document("$pull", new Document(DocumentToVariantConverter.STATS_FIELD, statsEntry));
    logger.debug("deleteStats: query = {}", query);
    logger.debug("deleteStats: update = {}", update);
    return variantsCollection.update(query, update, new QueryOptions(MULTI, true));
}
@Override
public QueryResult addAnnotations(List<VariantAnnotation> variantAnnotations, QueryOptions queryOptions) {
    // There is no dedicated "add" implementation: delegate to updateAnnotations,
    // which overwrites the annotation of each matching variant.
    logger.warn("Unimplemented VariantMongoDBAdaptor::addAnnotations. Using \"VariantMongoDBAdaptor::updateAnnotations\"");
    return updateAnnotations(variantAnnotations, queryOptions);
}
@Override
public QueryResult updateAnnotations(List<VariantAnnotation> variantAnnotations, QueryOptions queryOptions) {
    // One (query, update) pair per annotation, executed as a single bulk write.
    List<Bson> queries = new LinkedList<>();
    List<Bson> updates = new LinkedList<>();
    long start = System.nanoTime();
    DocumentToVariantConverter variantConverter = getDocumentToVariantConverter(new Query(), queryOptions);
    // Create the converter once instead of once per annotation; only its
    // convertToStorageType method is used inside the loop.
    DocumentToVariantAnnotationConverter converter = new DocumentToVariantAnnotationConverter();
    for (VariantAnnotation variantAnnotation : variantAnnotations) {
        String id = variantConverter.buildStorageId(variantAnnotation.getChromosome(), variantAnnotation.getStart(),
                variantAnnotation.getReference(), variantAnnotation.getAlternate());
        Document find = new Document("_id", id);
        Document convertedVariantAnnotation = converter.convertToStorageType(variantAnnotation);
        // Overwrite the first element of the annotation array with the new annotation.
        Document update = new Document("$set", new Document(DocumentToVariantConverter.ANNOTATION_FIELD + ".0",
                convertedVariantAnnotation));
        queries.add(find);
        updates.add(update);
    }
    BulkWriteResult writeResult = variantsCollection.update(queries, updates, null).first();
    // NOTE(review): numResults is hardcoded to 1 rather than writeResult.getModifiedCount();
    // kept as-is for backward compatibility — the BulkWriteResult itself carries the real counts.
    return new QueryResult<>("", ((int) (System.nanoTime() - start)), 1, 1, "", "", Collections.singletonList(writeResult));
}
@Override
public QueryResult updateCustomAnnotations(Query query, String name, AdditionalAttribute attribute, QueryOptions options) {
    // Write the attribute under "customAnnotation.<name>" for every variant matching the query.
    String annotationPath = DocumentToVariantConverter.CUSTOM_ANNOTATION_FIELD + "." + name;
    Document queryDocument = parseQuery(query);
    Document updateDocument = DocumentToVariantAnnotationConverter.convertToStorageType(attribute);
    Bson update = Updates.set(annotationPath, updateDocument);
    return variantsCollection.update(queryDocument, update, new QueryOptions(MULTI, true));
}
@Override
public QueryResult deleteAnnotation(String annotationId, Query query, QueryOptions queryOptions) {
    // Clear the first element of the annotation array ($set to null) on every matching variant.
    Document mongoQuery = parseQuery(query);
    logger.debug("deleteAnnotation: query = {}", mongoQuery);
    String annotationPath = DocumentToVariantConverter.ANNOTATION_FIELD + ".0";
    Document update = new Document("$set", new Document(annotationPath, null));
    logger.debug("deleteAnnotation: update = {}", update);
    return variantsCollection.update(mongoQuery, update, new QueryOptions(MULTI, true));
}
@Override
public void close() throws IOException {
    // Ensure the instance counter is decremented even if one of the close() calls throws;
    // otherwise a failed close would leak the counter and misreport open adaptors.
    try {
        if (closeConnection) {
            mongoManager.close();
        }
        studyConfigurationManager.close();
        cacheManager.close();
    } finally {
        NUMBER_INSTANCES.decrementAndGet();
    }
}
/**
 * Translates the given {@link Query} into the MongoDB filter {@link Document} used against
 * the variants collection. Region, id, xref and gene filters are handled here; annotation,
 * study and stats filters are delegated to {@code parseAnnotationQueryParams},
 * {@code parseStudyQueryParams} and {@code parseStatsQueryParams}.
 *
 * @param originalQuery query to translate; may be null (produces an empty filter).
 *                      Never modified: a defensive copy is taken.
 * @return MongoDB query document
 */
private Document parseQuery(final Query originalQuery) {
    QueryBuilder builder = new QueryBuilder();
    if (originalQuery != null) {
        // Copy given query. It may be modified
        Query query = new Query(originalQuery);
        // Tracks whether any filter other than a plain gene filter restricts the result
        // (region, chromosome, id, xref). Used below to decide if the consequence-type
        // param can be dropped once it is folded into the combined gene+SO filter.
        boolean nonGeneRegionFilter = false;
        /* VARIANT PARAMS */
        List<Region> regions = new ArrayList<>();
        if (isValidParam(query, VariantQueryParams.CHROMOSOME)) {
            nonGeneRegionFilter = true;
            regions.addAll(Region.parseRegions(query.getString(VariantQueryParams.CHROMOSOME.key()), true));
        }
        if (isValidParam(query, VariantQueryParams.REGION)) {
            nonGeneRegionFilter = true;
            regions.addAll(Region.parseRegions(query.getString(VariantQueryParams.REGION.key()), true));
        }
        if (!regions.isEmpty()) {
            getRegionFilter(regions, builder);
        }
        // List with all MongoIds from ID and XREF filters
        List<String> mongoIds = new ArrayList<>();
        if (isValidParam(query, VariantQueryParams.ID)) {
            nonGeneRegionFilter = true;
            List<String> idsList = query.getAsStringList(VariantQueryParams.ID.key());
            List<String> otherIds = new ArrayList<>(idsList.size());
            for (String value : idsList) {
                // Values parseable as a variant are matched by "_id"; anything else
                // (e.g. rs ids) is matched against the xrefs and ids fields instead.
                Variant variant = toVariant(value);
                if (variant != null) {
                    mongoIds.add(MongoDBVariantStageLoader.STRING_ID_CONVERTER.buildId(variant));
                } else {
                    otherIds.add(value);
                }
            }
            if (!otherIds.isEmpty()) {
                String ids = otherIds.stream().collect(Collectors.joining(","));
                addQueryStringFilter(DocumentToVariantConverter.ANNOTATION_FIELD
                        + "." + DocumentToVariantAnnotationConverter.XREFS_FIELD
                        + "." + DocumentToVariantAnnotationConverter.XREF_ID_FIELD, ids, builder, QueryOperation.OR);
                addQueryStringFilter(DocumentToVariantConverter.IDS_FIELD, ids, builder, QueryOperation.OR);
            }
        }
        List<String> genes = new ArrayList<>(query.getAsStringList(VariantQueryParams.GENE.key()));
        if (isValidParam(query, VariantQueryParams.ANNOT_XREF)) {
            List<String> xrefs = query.getAsStringList(VariantQueryParams.ANNOT_XREF.key());
            List<String> otherXrefs = new ArrayList<>();
            for (String value : xrefs) {
                Variant variant = toVariant(value);
                if (variant != null) {
                    mongoIds.add(MongoDBVariantStageLoader.STRING_ID_CONVERTER.buildId(variant));
                } else {
                    // Recognized accession formats are filtered as xrefs;
                    // everything else is treated as a gene name.
                    if (isVariantAccession(value) || isClinicalAccession(value) || isGeneAccession(value)) {
                        otherXrefs.add(value);
                    } else {
                        genes.add(value);
                    }
                }
            }
            if (!otherXrefs.isEmpty()) {
                nonGeneRegionFilter = true;
                addQueryStringFilter(DocumentToVariantConverter.ANNOTATION_FIELD
                                + '.' + DocumentToVariantAnnotationConverter.XREFS_FIELD
                                + '.' + DocumentToVariantAnnotationConverter.XREF_ID_FIELD,
                        String.join(",", otherXrefs), builder, QueryOperation.OR);
            }
        }
        if (!genes.isEmpty()) {
            if (isValidParam(query, VariantQueryParams.ANNOT_CONSEQUENCE_TYPE)) {
                // Combined gene + consequence-type filter using the precomputed gene-SO field.
                List<String> soList = query.getAsStringList(VariantQueryParams.ANNOT_CONSEQUENCE_TYPE.key());
                Set<String> gnSo = new HashSet<>(genes.size() * soList.size());
                for (String gene : genes) {
                    for (String so : soList) {
                        int soNumber = parseConsequenceType(so);
                        gnSo.add(DocumentToVariantAnnotationConverter.buildGeneSO(gene, soNumber));
                    }
                }
                builder.or(new BasicDBObject(DocumentToVariantConverter.ANNOTATION_FIELD
                        + '.' + DocumentToVariantAnnotationConverter.GENE_SO_FIELD, new BasicDBObject("$in", gnSo)));
                if (!nonGeneRegionFilter) {
                    // Filter already present in the GENE_SO_FIELD
                    query.remove(VariantQueryParams.ANNOT_CONSEQUENCE_TYPE.key());
                }
            } else {
                addQueryStringFilter(DocumentToVariantConverter.ANNOTATION_FIELD
                                + '.' + DocumentToVariantAnnotationConverter.XREFS_FIELD
                                + '.' + DocumentToVariantAnnotationConverter.XREF_ID_FIELD,
                        String.join(",", genes), builder, QueryOperation.OR);
            }
        }
        if (!mongoIds.isEmpty()) {
            // Single-id queries use "$eq" ("is"), multi-id queries use "$in".
            if (mongoIds.size() == 1) {
                builder.or(new QueryBuilder().and("_id").is(mongoIds.get(0)).get());
            } else {
                builder.or(new QueryBuilder().and("_id").in(mongoIds).get());
            }
        }
        if (isValidParam(query, VariantQueryParams.REFERENCE)) {
            addQueryStringFilter(DocumentToVariantConverter.REFERENCE_FIELD, query.getString(VariantQueryParams.REFERENCE.key()),
                    builder, QueryOperation.AND);
        }
        if (isValidParam(query, VariantQueryParams.ALTERNATE)) {
            addQueryStringFilter(DocumentToVariantConverter.ALTERNATE_FIELD, query.getString(VariantQueryParams.ALTERNATE.key()),
                    builder, QueryOperation.AND);
        }
        if (isValidParam(query, VariantQueryParams.TYPE)) {
            // Filtering by a type also matches all of its subtypes.
            addQueryFilter(DocumentToVariantConverter.TYPE_FIELD, query.getString(VariantQueryParams.TYPE.key()), builder,
                    QueryOperation.AND, s -> {
                        Set<VariantType> subTypes = Variant.subTypes(VariantType.valueOf(s));
                        List<String> types = new ArrayList<>(subTypes.size() + 1);
                        types.add(s);
                        subTypes.forEach(subType -> types.add(subType.toString()));
                        return types;
                    });
        }
        /* ANNOTATION PARAMS */
        parseAnnotationQueryParams(query, builder);
        /* STUDIES */
        final StudyConfiguration defaultStudyConfiguration = parseStudyQueryParams(query, builder);
        /* STATS PARAMS */
        parseStatsQueryParams(query, builder, defaultStudyConfiguration);
    }
    logger.debug("Query = {}", originalQuery == null ? "{}" : originalQuery.toJson());
    Document mongoQuery = new Document(builder.get().toMap());
    logger.debug("MongoDB Query = {}", mongoQuery.toJson(new JsonWriterSettings(JsonMode.SHELL, false)));
    return mongoQuery;
}
/**
 * Appends all variant-annotation filters (consequence type, biotype, protein-substitution
 * scores, conservation, gene traits, GO/expression gene lists, keywords, drugs, functional
 * scores, custom annotation and population frequencies) from the query to the builder.
 *
 * @param query   query holding the ANNOT_* params; may be null (no-op)
 * @param builder target MongoDB query builder, modified in place
 */
private void parseAnnotationQueryParams(Query query, QueryBuilder builder) {
    if (query != null) {
        if (isValidParam(query, VariantQueryParams.ANNOTATION_EXISTS)) {
            boolean exists = query.getBoolean(VariantQueryParams.ANNOTATION_EXISTS.key());
            builder.and(DocumentToVariantConverter.ANNOTATION_FIELD + "." + DocumentToVariantAnnotationConverter.ANNOT_ID_FIELD);
            builder.exists(exists);
            if (!exists) {
                // When asking for missing annotation, also require the consequence-type
                // SO-accession field to be absent.
                builder.and(DocumentToVariantConverter.ANNOTATION_FIELD
                        + '.' + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
                        + '.' + DocumentToVariantAnnotationConverter.CT_SO_ACCESSION_FIELD)
                        .exists(false);
            }
            // else , should be combined with an or, and it would not speed up the filtering. This scenario is not so common
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_CONSEQUENCE_TYPE)) {
            // SO terms/names are converted to their numeric accession before matching.
            String value = query.getString(VariantQueryParams.ANNOT_CONSEQUENCE_TYPE.key());
            addQueryFilter(DocumentToVariantConverter.ANNOTATION_FIELD
                    + '.' + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
                    + '.' + DocumentToVariantAnnotationConverter.CT_SO_ACCESSION_FIELD, value, builder, QueryOperation.AND,
                    VariantDBAdaptorUtils::parseConsequenceType);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_BIOTYPE)) {
            String biotypes = query.getString(VariantQueryParams.ANNOT_BIOTYPE.key());
            addQueryStringFilter(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CT_BIOTYPE_FIELD, biotypes, builder, QueryOperation.AND);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_POLYPHEN)) {
            String value = query.getString(VariantQueryParams.ANNOT_POLYPHEN.key());
            addScoreFilter(value, builder, VariantQueryParams.ANNOT_POLYPHEN, DocumentToVariantAnnotationConverter.POLYPHEN, true);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_SIFT)) {
            String value = query.getString(VariantQueryParams.ANNOT_SIFT.key());
            addScoreFilter(value, builder, VariantQueryParams.ANNOT_SIFT, DocumentToVariantAnnotationConverter.SIFT, true);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_PROTEIN_SUBSTITUTION)) {
            String value = query.getString(VariantQueryParams.ANNOT_PROTEIN_SUBSTITUTION.key());
            addScoreFilter(value, builder, VariantQueryParams.ANNOT_PROTEIN_SUBSTITUTION, true);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_CONSERVATION)) {
            String value = query.getString(VariantQueryParams.ANNOT_CONSERVATION.key());
            addScoreFilter(value, builder, VariantQueryParams.ANNOT_CONSERVATION, false);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_TRANSCRIPTION_FLAGS)) {
            String value = query.getString(VariantQueryParams.ANNOT_TRANSCRIPTION_FLAGS.key());
            addQueryStringFilter(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CT_TRANSCRIPT_ANNOT_FLAGS, value, builder, QueryOperation.AND);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_GENE_TRAITS_ID)) {
            String value = query.getString(VariantQueryParams.ANNOT_GENE_TRAITS_ID.key());
            addQueryStringFilter(DocumentToVariantConverter.ANNOTATION_FIELD
                    + '.' + DocumentToVariantAnnotationConverter.GENE_TRAIT_FIELD
                    + '.' + DocumentToVariantAnnotationConverter.GENE_TRAIT_ID_FIELD, value, builder, QueryOperation.AND);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_GENE_TRAITS_NAME)) {
            String value = query.getString(VariantQueryParams.ANNOT_GENE_TRAITS_NAME.key());
            addCompQueryFilter(DocumentToVariantConverter.ANNOTATION_FIELD
                    + '.' + DocumentToVariantAnnotationConverter.GENE_TRAIT_FIELD
                    + '.' + DocumentToVariantAnnotationConverter.GENE_TRAIT_NAME_FIELD, value, builder, false);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_HPO)) {
            // HPO terms are matched against the xrefs field, not the gene-trait field.
            String value = query.getString(VariantQueryParams.ANNOT_HPO.key());
            addQueryStringFilter(DocumentToVariantConverter.ANNOTATION_FIELD
                    + '.' + DocumentToVariantAnnotationConverter.XREFS_FIELD
                    + '.' + DocumentToVariantAnnotationConverter.XREF_ID_FIELD, value, builder,
                    QueryOperation.AND);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_GO)) {
            String value = query.getString(VariantQueryParams.ANNOT_GO.key());
            // Check if comma separated of semi colon separated (AND or OR)
            QueryOperation queryOperation = checkOperator(value);
            // Split by comma or semi colon
            List<String> goValues = splitValue(value, queryOperation);
            if (queryOperation == QueryOperation.AND) {
                throw VariantQueryException.malformedParam(VariantQueryParams.ANNOT_GO, value, "Unimplemented AND operator");
            }
            // Resolve GO terms to their gene set, then filter by those genes in the xrefs field.
            Set<String> genes = utils.getGenesByGo(goValues);
            builder.and(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.XREFS_FIELD
                    + "." + DocumentToVariantAnnotationConverter.XREF_ID_FIELD).in(genes);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_EXPRESSION)) {
            String value = query.getString(VariantQueryParams.ANNOT_EXPRESSION.key());
            // Check if comma separated of semi colon separated (AND or OR)
            QueryOperation queryOperation = checkOperator(value);
            // Split by comma or semi colon
            List<String> expressionValues = splitValue(value, queryOperation);
            if (queryOperation == QueryOperation.AND) {
                throw VariantQueryException.malformedParam(VariantQueryParams.ANNOT_EXPRESSION, value, "Unimplemented AND operator");
            }
            // Resolve expression values to their gene set, then filter by those genes in the xrefs field.
            Set<String> genes = utils.getGenesByExpression(expressionValues);
            builder.and(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.XREFS_FIELD
                    + "." + DocumentToVariantAnnotationConverter.XREF_ID_FIELD).in(genes);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_PROTEIN_KEYWORDS)) {
            String value = query.getString(VariantQueryParams.ANNOT_PROTEIN_KEYWORDS.key());
            addQueryStringFilter(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CT_PROTEIN_KEYWORDS, value, builder, QueryOperation.AND);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_DRUG)) {
            String value = query.getString(VariantQueryParams.ANNOT_DRUG.key());
            addQueryStringFilter(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.DRUG_FIELD
                    + "." + DocumentToVariantAnnotationConverter.DRUG_NAME_FIELD, value, builder, QueryOperation.AND);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_FUNCTIONAL_SCORE)) {
            String value = query.getString(VariantQueryParams.ANNOT_FUNCTIONAL_SCORE.key());
            addScoreFilter(value, builder, VariantQueryParams.ANNOT_FUNCTIONAL_SCORE, false);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_CUSTOM)) {
            String value = query.getString(VariantQueryParams.ANNOT_CUSTOM.key());
            addCompListQueryFilter(DocumentToVariantConverter.CUSTOM_ANNOTATION_FIELD, value, builder, true);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_POPULATION_ALTERNATE_FREQUENCY)) {
            String value = query.getString(VariantQueryParams.ANNOT_POPULATION_ALTERNATE_FREQUENCY.key());
            // Same method addFrequencyFilter is used for reference and alternate frequencies.
            // Need to provide the field (reference/alternate) where to check the frequency.
            addFrequencyFilter(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.POPULATION_FREQUENCIES_FIELD,
                    DocumentToVariantAnnotationConverter.POPULATION_FREQUENCY_ALTERNATE_FREQUENCY_FIELD, value, builder,
                    VariantQueryParams.ANNOT_POPULATION_ALTERNATE_FREQUENCY);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_POPULATION_REFERENCE_FREQUENCY)) {
            String value = query.getString(VariantQueryParams.ANNOT_POPULATION_REFERENCE_FREQUENCY.key());
            // Same method addFrequencyFilter is used for reference and alternate frequencies.
            // Need to provide the field (reference/alternate) where to check the frequency.
            addFrequencyFilter(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.POPULATION_FREQUENCIES_FIELD,
                    DocumentToVariantAnnotationConverter.POPULATION_FREQUENCY_REFERENCE_FREQUENCY_FIELD, value, builder,
                    VariantQueryParams.ANNOT_POPULATION_REFERENCE_FREQUENCY);
        }
        if (isValidParam(query, VariantQueryParams.ANNOT_POPULATION_MINOR_ALLELE_FREQUENCY)) {
            String value = query.getString(VariantQueryParams.ANNOT_POPULATION_MINOR_ALLELE_FREQUENCY.key());
            // "Minor" allele: the comparison must hold for whichever allele is less frequent,
            // so "<"/"<=" become an OR over both frequency fields, and ">"/">=" an AND.
            addFrequencyFilter(DocumentToVariantConverter.ANNOTATION_FIELD + "."
                    + DocumentToVariantAnnotationConverter.POPULATION_FREQUENCIES_FIELD,
                    value, builder, VariantQueryParams.ANNOT_POPULATION_MINOR_ALLELE_FREQUENCY,
                    (v, queryBuilder) -> {
                        String op = getOperator(v);
                        String obj = v.replaceFirst(op, "");
                        double aDouble = Double.parseDouble(obj);
                        switch (op) {
                            case "<":
                                queryBuilder.or(QueryBuilder.start(DocumentToVariantAnnotationConverter.
                                        POPULATION_FREQUENCY_REFERENCE_FREQUENCY_FIELD).lessThan(aDouble).get(),
                                        QueryBuilder.start(DocumentToVariantAnnotationConverter.
                                                POPULATION_FREQUENCY_ALTERNATE_FREQUENCY_FIELD).lessThan(aDouble).get()
                                );
                                break;
                            case "<=":
                                queryBuilder.or(QueryBuilder.start(DocumentToVariantAnnotationConverter.
                                        POPULATION_FREQUENCY_REFERENCE_FREQUENCY_FIELD).lessThanEquals(aDouble).get(),
                                        QueryBuilder.start(DocumentToVariantAnnotationConverter.
                                                POPULATION_FREQUENCY_ALTERNATE_FREQUENCY_FIELD).lessThanEquals(aDouble).get()
                                );
                                break;
                            case ">":
                                queryBuilder.and(DocumentToVariantAnnotationConverter.
                                        POPULATION_FREQUENCY_REFERENCE_FREQUENCY_FIELD).greaterThan(aDouble)
                                        .and(DocumentToVariantAnnotationConverter.
                                                POPULATION_FREQUENCY_ALTERNATE_FREQUENCY_FIELD).greaterThan(aDouble);
                                break;
                            case ">=":
                                queryBuilder.and(DocumentToVariantAnnotationConverter.
                                        POPULATION_FREQUENCY_REFERENCE_FREQUENCY_FIELD).greaterThanEquals(aDouble)
                                        .and(DocumentToVariantAnnotationConverter.
                                                POPULATION_FREQUENCY_ALTERNATE_FREQUENCY_FIELD).greaterThanEquals(aDouble);
                                break;
                            default:
                                throw new IllegalArgumentException("Unsupported operator '" + op + "'");
                        }
                    });
        }
    }
}
/**
 * Appends study-related filters (studies, files, filter, genotypes, samples) to the builder.
 * <p>
 * Depending on how many studies exist and which filters are combined, the study filters
 * are either placed under a single {@code $elemMatch} over the studies array, or written
 * directly at the root of the query (see #493).
 *
 * @param query   query holding the study params; may be null (no-op, returns null)
 * @param builder target MongoDB query builder, modified in place
 * @return the default StudyConfiguration used to resolve file/sample/cohort names, or null
 */
private StudyConfiguration parseStudyQueryParams(Query query, QueryBuilder builder) {
    if (query != null) {
        Map<String, Integer> studies = getStudyConfigurationManager().getStudies(null);
        boolean singleStudy = studies.size() == 1;
        boolean validStudiesFilter = isValidParam(query, VariantQueryParams.STUDIES);
        // SAMPLES filter will add a FILES filter if absent
        boolean validFilesFilter = isValidParam(query, VariantQueryParams.FILES) || isValidParam(query, VariantQueryParams.SAMPLES);
        boolean otherFilters =
                isValidParam(query, VariantQueryParams.FILES)
                        || isValidParam(query, VariantQueryParams.GENOTYPE)
                        || isValidParam(query, VariantQueryParams.SAMPLES)
                        || isValidParam(query, VariantQueryParams.FILTER);
        // Use an elemMatch with all the study filters if there is more than one study registered,
        // or FILES and STUDIES filters are being used.
        // If filters STUDIES+FILES is used, elemMatch is required to use the index correctly. See #493
        boolean studyElemMatch = (!singleStudy || (validFilesFilter && validStudiesFilter));
        // If only studyId filter is being used, elemMatch is not needed
        if (validStudiesFilter && !otherFilters) {
            studyElemMatch = false;
        }
        // If using an elemMatch for the study, keys don't need to start with "studies"
        String studyQueryPrefix = studyElemMatch ? "" : DocumentToVariantConverter.STUDIES_FIELD + '.';
        QueryBuilder studyBuilder = QueryBuilder.start();
        final StudyConfiguration defaultStudyConfiguration = utils.getDefaultStudyConfiguration(query, null);
        if (isValidParam(query, VariantQueryParams.STUDIES)) {
            String sidKey = DocumentToVariantConverter.STUDIES_FIELD + '.' + DocumentToStudyVariantEntryConverter.STUDYID_FIELD;
            String value = query.getString(VariantQueryParams.STUDIES.key());
            // Check that the study exists
            QueryOperation studiesOperation = checkOperator(value);
            List<String> studiesNames = splitValue(value, studiesOperation);
            List<Integer> studyIds = utils.getStudyIds(studiesNames, studies); // Non negated studyIds
            // If the Studies query has an AND operator or includes negated fields, it can not be represented only
            // in the "elemMatch". It needs to be in the root
            boolean anyNegated = studiesNames.stream().anyMatch(VariantDBAdaptorUtils::isNegated);
            boolean studyFilterAtRoot = studiesOperation == QueryOperation.AND || anyNegated;
            if (studyFilterAtRoot) {
                addQueryFilter(sidKey, value, builder, QueryOperation.AND, study -> utils.getStudyId(study, false, studies));
            }
            // Add all non negated studies to the elemMatch builder if it is being used,
            // or it is not and it has not been added to the root
            if (studyElemMatch || !studyFilterAtRoot) {
                if (!studyIds.isEmpty()) {
                    if (!singleStudy || anyNegated || validFilesFilter) {
                        String studyIdsCsv = studyIds.stream().map(Object::toString).collect(Collectors.joining(","));
                        addQueryIntegerFilter(studyQueryPrefix + DocumentToStudyVariantEntryConverter.STUDYID_FIELD, studyIdsCsv,
                                studyBuilder, QueryOperation.AND);
                    } // There is only one study! We can skip this filter
                }
            }
        }
        if (isValidParam(query, VariantQueryParams.FILES)) {
            // File names/ids are resolved against the default study configuration.
            addQueryFilter(studyQueryPrefix + DocumentToStudyVariantEntryConverter.FILES_FIELD
                    + '.' + DocumentToStudyVariantEntryConverter.FILEID_FIELD,
                    query.getString(VariantQueryParams.FILES.key()), studyBuilder, QueryOperation.AND,
                    f -> utils.getFileId(f, false, defaultStudyConfiguration));
        }
        if (isValidParam(query, VariantQueryParams.FILTER)) {
            // The FILTER attribute lives inside each file entry. If a FILES filter is present,
            // both FILTER and FILEID are combined under a single elemMatch over the files array.
            String filesValue = query.getString(VariantQueryParams.FILES.key());
            QueryOperation filesOperation = checkOperator(filesValue);
            List<String> fileNames = splitValue(filesValue, filesOperation);
            List<Integer> fileIds = utils.getFileIds(fileNames, true, defaultStudyConfiguration);
            String fileQueryPrefix;
            if (fileIds.isEmpty()) {
                fileQueryPrefix = studyQueryPrefix + DocumentToStudyVariantEntryConverter.FILES_FIELD + '.';
                addQueryStringFilter(fileQueryPrefix + DocumentToStudyVariantEntryConverter.ATTRIBUTES_FIELD + '.' + StudyEntry.FILTER,
                        query.getString(VariantQueryParams.FILTER.key()), studyBuilder, QueryOperation.AND);
            } else {
                QueryBuilder fileBuilder = QueryBuilder.start();
                addQueryStringFilter(DocumentToStudyVariantEntryConverter.ATTRIBUTES_FIELD + '.' + StudyEntry.FILTER,
                        query.getString(VariantQueryParams.FILTER.key()), fileBuilder, QueryOperation.AND);
                fileBuilder.and(DocumentToStudyVariantEntryConverter.FILEID_FIELD).in(fileIds);
                studyBuilder.and(studyQueryPrefix + DocumentToStudyVariantEntryConverter.FILES_FIELD).elemMatch(fileBuilder.get());
            }
        }
        // Map of sample -> list of genotypes to match, filled from GENOTYPE and SAMPLES params.
        Map<Object, List<String>> genotypesFilter = new HashMap<>();
        if (isValidParam(query, VariantQueryParams.GENOTYPE)) {
            String sampleGenotypes = query.getString(VariantQueryParams.GENOTYPE.key());
            parseGenotypeFilter(sampleGenotypes, genotypesFilter);
        }
        if (isValidParam(query, VariantQueryParams.SAMPLES)) {
            // A SAMPLES filter means "sample carries an alternate allele":
            // translated into a genotype filter with every het/hom-alt genotype.
            Set<Integer> files = new HashSet<>();
            String samples = query.getString(VariantQueryParams.SAMPLES.key());
            for (String sample : samples.split(",")) {
                int sampleId = utils.getSampleId(sample, defaultStudyConfiguration);
                genotypesFilter.put(sampleId, Arrays.asList(
                        "1",
                        "0/1", "0|1", "1|0",
                        "1/1", "1|1",
                        "1/2", "1|2", "2|1"
                ));
                if (!isValidParam(query, VariantQueryParams.FILES) && defaultStudyConfiguration != null) {
                    // Collect the indexed files containing this sample, to narrow the query below.
                    for (Integer file : defaultStudyConfiguration.getIndexedFiles()) {
                        if (defaultStudyConfiguration.getSamplesInFiles().get(file).contains(sampleId)) {
                            files.add(file);
                        }
                    }
                }
            }
            // If there is no valid files filter, add files filter to speed up this query
            if (!isValidParam(query, VariantQueryParams.FILES) && !files.isEmpty()) {
                addQueryFilter(studyQueryPrefix + DocumentToStudyVariantEntryConverter.FILES_FIELD
                        + '.' + DocumentToStudyVariantEntryConverter.FILEID_FIELD, files, studyBuilder, QueryOperation.AND,
                        f -> utils.getFileId(f, false, defaultStudyConfiguration));
            }
        }
        if (!genotypesFilter.isEmpty()) {
            for (Map.Entry<Object, List<String>> entry : genotypesFilter.entrySet()) {
                Object sample = entry.getKey();
                List<String> genotypes = entry.getValue();
                int sampleId = utils.getSampleId(sample, defaultStudyConfiguration);
                QueryBuilder genotypesBuilder = QueryBuilder.start();
                // Default genotypes are NOT stored in the genotypes map; matching them
                // requires the sample to be absent from every non-default genotype list.
                List<String> defaultGenotypes;
                if (defaultStudyConfiguration != null) {
                    defaultGenotypes = defaultStudyConfiguration.getAttributes().getAsStringList(DEFAULT_GENOTYPE.key());
                } else {
                    defaultGenotypes = Arrays.asList("0/0", "0|0");
                }
                for (String genotype : genotypes) {
                    boolean negated = isNegated(genotype);
                    if (negated) {
                        // Strip the leading negation marker.
                        genotype = genotype.substring(1);
                    }
                    if (defaultGenotypes.contains(genotype)) {
                        // Candidate non-default genotypes the sample could be stored under.
                        List<String> otherGenotypes = Arrays.asList(
                                "0/0", "0|0",
                                "0/1", "1/0", "1/1", "-1/-1",
                                "0|1", "1|0", "1|1", "-1|-1",
                                "0|2", "2|0", "2|1", "1|2", "2|2",
                                "0/2", "2/0", "2/1", "1/2", "2/2",
                                DocumentToSamplesConverter.UNKNOWN_GENOTYPE);
                        if (negated) {
                            // NOT the default genotype: the sample appears under SOME other genotype.
                            for (String otherGenotype : otherGenotypes) {
                                if (defaultGenotypes.contains(otherGenotype)) {
                                    continue;
                                }
                                String key = studyQueryPrefix
                                        + DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD
                                        + '.' + otherGenotype;
                                genotypesBuilder.or(new BasicDBObject(key, sampleId));
                            }
                        } else {
                            // The default genotype: the sample appears under NO other genotype.
                            QueryBuilder andBuilder = QueryBuilder.start();
                            for (String otherGenotype : otherGenotypes) {
                                if (defaultGenotypes.contains(otherGenotype)) {
                                    continue;
                                }
                                String key = studyQueryPrefix
                                        + DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD
                                        + '.' + otherGenotype;
                                andBuilder.and(new BasicDBObject(key,
                                        new Document("$ne", sampleId)));
                            }
                            genotypesBuilder.or(andBuilder.get());
                        }
                    } else {
                        String s = studyQueryPrefix
                                + DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD
                                + '.' + DocumentToSamplesConverter.genotypeToStorageType(genotype);
                        if (negated) {
                            //and [ {"gt.0|1" : { $ne : <sampleId> } } ]
                            genotypesBuilder.and(new BasicDBObject(s, new BasicDBObject("$ne", sampleId)));
                        } else {
                            //or [ {"gt.0|1" : <sampleId> } ]
                            genotypesBuilder.or(new BasicDBObject(s, sampleId));
                        }
                    }
                }
                studyBuilder.and(genotypesBuilder.get());
            }
        }
        // If Study Query is used then we add a elemMatch query
        DBObject studyQuery = studyBuilder.get();
        if (!studyQuery.keySet().isEmpty()) {
            if (studyElemMatch) {
                builder.and(DocumentToVariantConverter.STUDIES_FIELD).elemMatch(studyQuery);
            } else {
                builder.and(studyQuery);
            }
        }
        return defaultStudyConfiguration;
    } else {
        return null;
    }
}
/**
 * Appends cohort-stats filters (cohort, MAF, MGF, missing alleles/genotypes, genotype counts)
 * from the query to the builder.
 *
 * @param query                     query holding the stats params; may be null (no-op)
 * @param builder                   target MongoDB query builder, modified in place
 * @param defaultStudyConfiguration study used to resolve cohort names without an explicit
 *                                  "{study}:" prefix; may be null
 */
private void parseStatsQueryParams(Query query, QueryBuilder builder, StudyConfiguration defaultStudyConfiguration) {
    if (query != null) {
        // Use isValidParam for consistency with the other query parsers in this class
        // (equivalent to the previous "get() != null && !getString().isEmpty()" checks).
        if (isValidParam(query, VariantQueryParams.COHORTS)) {
            addQueryFilter(DocumentToVariantConverter.STATS_FIELD
                    + "." + DocumentToVariantStatsConverter.COHORT_ID,
                    query.getString(VariantQueryParams.COHORTS.key()), builder, QueryOperation.AND,
                    s -> {
                        try {
                            // Numeric values are already cohort ids.
                            return Integer.parseInt(s);
                        } catch (NumberFormatException ignore) {
                            // Otherwise resolve "{study}:{cohort}", or plain "{cohort}" against the default study.
                            int indexOf = s.lastIndexOf(":");
                            if (defaultStudyConfiguration == null && indexOf < 0) {
                                throw VariantQueryException.malformedParam(VariantQueryParams.COHORTS, s, "Expected {study}:{cohort}");
                            } else {
                                String study;
                                String cohort;
                                Integer cohortId;
                                if (defaultStudyConfiguration != null && indexOf < 0) {
                                    cohort = s;
                                    cohortId = utils.getCohortId(cohort, defaultStudyConfiguration);
                                } else {
                                    study = s.substring(0, indexOf);
                                    cohort = s.substring(indexOf + 1);
                                    StudyConfiguration studyConfiguration =
                                            utils.getStudyConfiguration(study, defaultStudyConfiguration);
                                    cohortId = utils.getCohortId(cohort, studyConfiguration);
                                }
                                return cohortId;
                            }
                        }
                    });
        }
        if (isValidParam(query, VariantQueryParams.STATS_MAF)) {
            addStatsFilterList(DocumentToVariantStatsConverter.MAF_FIELD, query.getString(VariantQueryParams.STATS_MAF.key()),
                    builder, defaultStudyConfiguration);
        }
        if (isValidParam(query, VariantQueryParams.STATS_MGF)) {
            addStatsFilterList(DocumentToVariantStatsConverter.MGF_FIELD, query.getString(VariantQueryParams.STATS_MGF.key()),
                    builder, defaultStudyConfiguration);
        }
        if (isValidParam(query, VariantQueryParams.MISSING_ALLELES)) {
            addStatsFilterList(DocumentToVariantStatsConverter.MISSALLELE_FIELD,
                    query.getString(VariantQueryParams.MISSING_ALLELES.key()), builder, defaultStudyConfiguration);
        }
        if (isValidParam(query, VariantQueryParams.MISSING_GENOTYPES)) {
            addStatsFilterList(DocumentToVariantStatsConverter.MISSGENOTYPE_FIELD,
                    query.getString(VariantQueryParams.MISSING_GENOTYPES.key()), builder, defaultStudyConfiguration);
        }
        if (query.get("numgt") != null && !query.getString("numgt").isEmpty()) {
            // "numgt" has no VariantQueryParams entry; each value is "{genotype}:{comparison}".
            for (String numgt : query.getAsStringList("numgt")) {
                String[] split = numgt.split(":");
                addCompQueryFilter(
                        DocumentToVariantConverter.STATS_FIELD + "." + DocumentToVariantStatsConverter.NUMGT_FIELD + "." + split[0],
                        split[1], builder, false);
            }
        }
    }
}
/**
 * Builds the MongoDB projection {@link Document} for a variant query.
 * <p>
 * Starts from the fields requested in {@code options} (include/exclude), forces the converter's
 * required fields, and, when exactly one study is returned, restricts the studies array with
 * {@code $elemMatch} so only that study's entry is fetched.
 * <p>
 * NOTE(review): this method MUTATES the given {@code options}: a boolean {@code sort=true} is
 * rewritten to {@code sort=_id} (with ascending order if no order was given), and a non-"_id"
 * falsy sort value is removed.
 *
 * @param query   Query being executed; only used to resolve the returned studies.
 * @param options Query options (may be null); include/exclude and sort are read from here.
 * @return Projection document mapping short field names to 1 (include).
 */
private Document createProjection(Query query, QueryOptions options) {
    Document projection = new Document();

    if (options == null) {
        options = new QueryOptions();
    }

    // Normalize SORT: "true" becomes a sort by "_id"; any other non-"_id" value is dropped.
    if (options.containsKey(QueryOptions.SORT) && !options.getString(QueryOptions.SORT).equals("_id")) {
        if (options.getBoolean(QueryOptions.SORT)) {
            options.put(QueryOptions.SORT, "_id");
            options.putIfAbsent(QueryOptions.ORDER, QueryOptions.ASCENDING);
        } else {
            options.remove(QueryOptions.SORT);
        }
    }

    Set<VariantField> returnedFields = VariantField.getReturnedFields(options);
    // Add all required fields
    returnedFields.addAll(DocumentToVariantConverter.REQUIRED_FIELDS_SET);
    // StudyID is required whenever the studies array is returned.
    if (returnedFields.contains(VariantField.STUDIES) && !returnedFields.contains(VariantField.STUDIES_STUDY_ID)) {
        returnedFields.add(VariantField.STUDIES_STUDY_ID);
    }
    returnedFields = VariantField.prune(returnedFields);

    if (!returnedFields.isEmpty()) { //Include some
        for (VariantField s : returnedFields) {
            List<String> keys = DocumentToVariantConverter.toShortFieldName(s);
            if (keys != null) {
                for (String key : keys) {
                    projection.put(key, 1);
                }
            } else {
                logger.warn("Unknown include field: {}", s);
            }
        }
    }

//        if (query.containsKey(VariantQueryParams.RETURNED_FILES.key()) && projection.containsKey(DocumentToVariantConverter
//                .STUDIES_FIELD)) {
//            List<Integer> files = query.getAsIntegerList(VariantQueryParams.RETURNED_FILES.key());
//            projection.put(
//                    DocumentToVariantConverter.STUDIES_FIELD,
//                    new Document(
//                            "$elemMatch",
//                            new Document(
//                                    DocumentToStudyVariantEntryConverter.FILES_FIELD + "." + DocumentToStudyVariantEntryConverter
//                                            .FILEID_FIELD,
//                                    new Document(
//                                            "$in",
//                                            files
//                                    )
//                            )
//                    )
//            );
//        }

    List<Integer> studiesIds = utils.getReturnedStudies(query, options);
    // Use elemMatch only if there is one study to return.
    // (With several studies, $elemMatch would return only the first matching array element.)
    if (studiesIds.size() == 1) {
        projection.put(
                DocumentToVariantConverter.STUDIES_FIELD,
                new Document(
                        "$elemMatch",
                        new Document(
                                DocumentToStudyVariantEntryConverter.STUDYID_FIELD,
                                new Document(
                                        "$in",
                                        studiesIds
                                )
                        )
                )
        );
    }

    logger.debug("QueryOptions: = {}", options.toJson());
    logger.debug("Projection: = {}", projection.toJson(new JsonWriterSettings(JsonMode.SHELL, false)));
    return projection;
}
/**
 * Two steps insertion:
 * First check that the variant and study exists making an update.
 * For those who doesn't exist, pushes a study with the file and genotype information.
 * <p>
 * The documents that throw a "dup key" exception are those variants that exist and have the study.
 * Then, only for those variants, make a second update.
 * <p>
 * *An interesting idea would be to invert this actions depending on the number of already inserted variants.
 *
 * @param data                        Variants to insert
 * @param fileId                      File ID
 * @param variantConverter            Variant converter to be used
 * @param variantSourceEntryConverter Variant source converter to be used
 * @param studyConfiguration          Configuration for the study
 * @param loadedSampleIds             Other loaded sampleIds EXCEPT those that are going to be loaded
 * @return QueryResult whose single result is the accumulated {@link MongoDBVariantWriteResult}
 */
QueryResult<MongoDBVariantWriteResult> insert(List<Variant> data, int fileId, DocumentToVariantConverter variantConverter,
                                              DocumentToStudyVariantEntryConverter variantSourceEntryConverter,
                                              StudyConfiguration studyConfiguration, List<Integer> loadedSampleIds) {
    MongoDBVariantWriteResult writeResult = new MongoDBVariantWriteResult();
    long startTime = System.currentTimeMillis();
    if (data.isEmpty()) {
        // Nothing to do: return an empty (successful) result.
        return new QueryResult<>("insertVariants", 0, 1, 1, "", "", Collections.singletonList(writeResult));
    }
    List<Bson> queries = new ArrayList<>(data.size());
    List<Bson> updates = new ArrayList<>(data.size());
    // Use a multiset instead of a normal set, to keep tracking of duplicated variants
    Multiset<String> nonInsertedVariants = HashMultiset.create();
    String fileIdStr = Integer.toString(fileId);

//        List<String> extraFields = studyConfiguration.getAttributes().getAsStringList(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS
//                .key());
    boolean excludeGenotypes = studyConfiguration.getAttributes().getBoolean(VariantStorageEngine.Options.EXCLUDE_GENOTYPES.key(),
            VariantStorageEngine.Options.EXCLUDE_GENOTYPES.defaultValue());

    long nanoTime = System.nanoTime();
    // Map from UNKNOWN_GENOTYPE ("?/?") to the previously loaded samples; merged into each new
    // study document so that already-loaded samples get an explicit unknown genotype.
    Map missingSamples = Collections.emptyMap();
    String defaultGenotype = studyConfiguration.getAttributes().getString(DEFAULT_GENOTYPE.key(), "");
    if (defaultGenotype.equals(DocumentToSamplesConverter.UNKNOWN_GENOTYPE)) {
        logger.debug("Do not need fill gaps. DefaultGenotype is UNKNOWN_GENOTYPE({}).", DocumentToSamplesConverter.UNKNOWN_GENOTYPE);
    } else if (excludeGenotypes) {
        logger.debug("Do not need fill gaps. Excluding genotypes.");
    } else if (!loadedSampleIds.isEmpty()) {
        missingSamples = new Document(DocumentToSamplesConverter.UNKNOWN_GENOTYPE, loadedSampleIds); // ?/?
    }
//        List<Object> missingOtherValues = new ArrayList<>(loadedSampleIds.size());
//        for (int i = 0; i < loadedSampleIds.size(); i++) {
//            missingOtherValues.add(DBObjectToSamplesConverter.UNKNOWN_FIELD);
//        }

    // FIRST STEP: try to upsert every variant, guarded so the upsert fails (dup key) when the
    // variant already contains this study.
    for (Variant variant : data) {
        if (variant.getType().equals(VariantType.NO_VARIATION)) {
            //Storage-MongoDB is not able to store NON VARIANTS
            writeResult.setSkippedVariants(writeResult.getSkippedVariants() + 1);
            continue;
        } else if (variant.getType().equals(VariantType.SYMBOLIC)) {
            logger.warn("Skip symbolic variant " + variant.toString());
            writeResult.setSkippedVariants(writeResult.getSkippedVariants() + 1);
            continue;
        }
        String id = variantConverter.buildStorageId(variant);
        for (StudyEntry studyEntry : variant.getStudies()) {
            // Only process the study entry coming from the file being loaded.
            if (studyEntry.getFiles().size() == 0 || !studyEntry.getFiles().get(0).getFileId().equals(fileIdStr)) {
                continue;
            }
            int studyId = studyConfiguration.getStudyId();
            Document study = variantSourceEntryConverter.convertToStorageType(variant, studyEntry);
            Document genotypes = study.get(DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD, Document.class);
            if (genotypes != null) {        //If genotypes is null, genotypes are not suppose to be loaded
                genotypes.putAll(missingSamples);   //Add missing samples
//                    for (String extraField : extraFields) {
//                        List<Object> otherFieldValues = (List<Object>) study.get(extraField.toLowerCase());
//                        otherFieldValues.addAll(0, missingOtherValues);
//                    }
            }
            Document push = new Document(DocumentToVariantConverter.STUDIES_FIELD, study);
            Document update = new Document()
                    .append("$push", push)
                    .append("$setOnInsert", variantConverter.convertToStorageType(variant));
            if (variant.getIds() != null && !variant.getIds().isEmpty() && !variant.getIds().iterator().next().isEmpty()) {
                update.put("$addToSet", new Document(DocumentToVariantConverter.IDS_FIELD, new Document("$each",
                        variant.getIds())));
            }

            // { _id: <variant_id>, "studies.sid": {$ne: <studyId> } }
            //If the variant exists and contains the study, this find will fail, will try to do the upsert, and throw a
            // duplicated key exception.
            queries.add(new Document("_id", id).append(DocumentToVariantConverter.STUDIES_FIELD
                            + "." + DocumentToStudyVariantEntryConverter.STUDYID_FIELD,
                    new Document("$ne", studyId)));
            updates.add(update);
        }
    }

    //
    if (!queries.isEmpty()) {
        QueryOptions options = new QueryOptions(UPSERT, true);
        options.put(MULTI, false);
        int newDocuments;
        int updatedObjects;
        try {
            BulkWriteResult bulkWriteResult;
            bulkWriteResult = variantsCollection.update(queries, updates, options).first();
            newDocuments = bulkWriteResult.getUpserts().size();
            updatedObjects = bulkWriteResult.getModifiedCount();
        } catch (MongoBulkWriteException e) {
            // Partial failure: collect the counters from what did succeed, and record every
            // duplicate-key failure as a "non inserted" variant to retry in the second step.
            BulkWriteResult bulkWriteResult;
            bulkWriteResult = e.getWriteResult();
            newDocuments = bulkWriteResult.getUpserts().size();
            updatedObjects = bulkWriteResult.getModifiedCount();
            for (BulkWriteError writeError : e.getWriteErrors()) {
                if (writeError.getCode() == 11000) { //Dup Key error code
                    // Recover the variant id from the error message text.
                    Matcher matcher = writeResultErrorPattern.matcher(writeError.getMessage());
                    if (matcher.find()) {
                        String id = matcher.group(1);
                        nonInsertedVariants.add(id);
                    } else {
                        throw e;
                    }
                } else {
                    throw e;
                }
            }
        }
        writeResult.setNewVariants(newDocuments);
        writeResult.setUpdatedVariants(updatedObjects);
//            writeResult.setNewDocuments(data.size() - nonInsertedVariants.size() - writeResult.getSkippedVariants());
        queries.clear();
        updates.clear();
    }
    writeResult.setNewVariantsNanoTime(System.nanoTime() - nanoTime);
    nanoTime = System.nanoTime();

    // SECOND STEP: for the variants that already contained this study (dup key above), push the
    // file entry and genotypes into the existing study sub-document (positional "$" operator).
    for (Variant variant : data) {
        // Side effect on the input list: the annotation is discarded before the second update.
        // Presumably it is not needed past this point — confirm with callers before reusing `data`.
        variant.setAnnotation(null);
        String id = variantConverter.buildStorageId(variant);
        // NOTE(review): nonInsertedVariants is never null here (created above); the null-check is redundant.
        if (nonInsertedVariants != null && !nonInsertedVariants.contains(id)) {
            continue;   //Already inserted variant
        }
        for (StudyEntry studyEntry : variant.getStudies()) {
            if (studyEntry.getFiles().size() == 0 || !studyEntry.getFiles().get(0).getFileId().equals(fileIdStr)) {
                continue;
            }

            Document studyObject = variantSourceEntryConverter.convertToStorageType(variant, studyEntry);
            Document genotypes = studyObject.get(DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD, Document.class);
            Document push = new Document();

            if (!excludeGenotypes) {
                if (genotypes != null) { //If genotypes is null, genotypes are not suppose to be loaded
                    for (String genotype : genotypes.keySet()) {
                        push.put(DocumentToVariantConverter.STUDIES_FIELD + ".$." + DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD
                                + "." + genotype, new Document("$each", genotypes.get(genotype)));
                    }
//                    for (String extraField : extraFields) {
//                        List values = (List) studyObject.get(extraField.toLowerCase());
//                        push.put(DBObjectToVariantConverter.STUDIES_FIELD + ".$." + extraField.toLowerCase(),
//                                new Document("$each", values).append("$position", loadedSampleIds.size()));
//                    }
                } else {
                    push.put(DocumentToVariantConverter.STUDIES_FIELD + ".$." + DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD,
                            Collections.emptyMap());
                }
            }
            push.put(DocumentToVariantConverter.STUDIES_FIELD + ".$." + DocumentToStudyVariantEntryConverter.FILES_FIELD, ((List)
                    studyObject.get(DocumentToStudyVariantEntryConverter.FILES_FIELD)).get(0));
            Document update = new Document(new Document("$push", push));

            // Match the study entry but only when this file is not yet present in it.
            queries.add(new Document("_id", id)
                    .append(DocumentToVariantConverter.STUDIES_FIELD
                            + '.' + DocumentToStudyVariantEntryConverter.STUDYID_FIELD, studyConfiguration.getStudyId())
                    .append(DocumentToVariantConverter.STUDIES_FIELD
                            + '.' + DocumentToStudyVariantEntryConverter.FILES_FIELD
                            + '.' + DocumentToStudyVariantEntryConverter.FILEID_FIELD, new Document("$ne", fileId))
            );
            updates.add(update);
        }
    }
    writeResult.setExistingVariantsNanoTime(System.nanoTime() - nanoTime);

    if (!queries.isEmpty()) {
        QueryOptions options = new QueryOptions(UPSERT, false);
        options.put(MULTI, false);
        QueryResult<BulkWriteResult> update = variantsCollection.update(queries, updates, options);
        // Can happen that nonInsertedVariantsNum != queries.size() != nonInsertedVariants.size() if there was
        // a duplicated variant.
        writeResult.setNonInsertedVariants(nonInsertedVariants.size() - update.first().getMatchedCount());
        writeResult.setUpdatedVariants(writeResult.getUpdatedVariants() + update.first().getModifiedCount());
    }

    return new QueryResult<>("insertVariants", ((int) (System.currentTimeMillis() - startTime)), 1, 1, "", "",
            Collections.singletonList(writeResult));
}
/**
 * Fills the missing genotype values for the new loaded samples.
 * Missing data is which was present in the database but not in the input file.
 * Data present in the file but not in the database is added during the {@link #insert} step.
 * <p>
 * +--------+---------+
 * | Loaded | NewFile |
 * +--------+--------+---------+
 * | 10:A:T | DATA   |         |  &lt;- Missing data to be filled
 * +--------+--------+---------+
 * | 20:C:T |        | DATA    |  &lt;- Missing data already filled in the {@link #insert} step.
 * +--------+--------+---------+
 *
 * @param fileId             Loading File ID
 * @param chromosomes        Chromosomes covered by the current file
 * @param fileSampleIds      FileSampleIds
 * @param studyConfiguration StudyConfiguration
 * @return WriteResult (empty QueryResult when no gap filling is needed)
 */
QueryResult<UpdateResult> fillFileGaps(int fileId, List<String> chromosomes, List<Integer> fileSampleIds,
                                       StudyConfiguration studyConfiguration) {

    // { "studies.sid" : <studyId>, "studies.files.fid" : { $ne : <fileId> } },
    // { $push : {
    //      "studies.$.gt.?/?" : {$each : [ <fileSampleIds> ] }
    // } }
    if (studyConfiguration.getAttributes().getAsStringList(DEFAULT_GENOTYPE.key(), "")
            .equals(Collections.singletonList(DocumentToSamplesConverter.UNKNOWN_GENOTYPE))
//                && studyConfiguration.getAttributes().getAsStringList(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key()).isEmpty()
            ) {
        // Check if the default genotype is the unknown genotype. In that case, is not required to fill missing genotypes.
        // Previously, also checks if there where EXTRA_GENOTYPE_FIELDS like DP:AD,... . In that case, those arrays had to be filled.
        logger.debug("Do not need fill gaps. DefaultGenotype is UNKNOWN_GENOTYPE({}).", DocumentToSamplesConverter.UNKNOWN_GENOTYPE);
        return new QueryResult<>();
    } else if (studyConfiguration.getAttributes().getBoolean(VariantStorageEngine.Options.EXCLUDE_GENOTYPES.key(),
            VariantStorageEngine.Options.EXCLUDE_GENOTYPES.defaultValue())) {
        // Check if the genotypes are not required. In that case, no fillGaps is needed
        logger.debug("Do not need fill gaps. Exclude genotypes.");
        return new QueryResult<>();
    } else {
        BiMap<String, Integer> indexedSamples = StudyConfiguration.getIndexedSamples(studyConfiguration);
        if (indexedSamples.isEmpty() || indexedSamples.values().equals(new HashSet<>(fileSampleIds))) {
            // If the loaded samples match with the current samples means that there where no other samples loaded.
            // There were no gaps, so it is not needed to fill anything.
            logger.debug("Do not need fill gaps. First sample batch.");
            return new QueryResult<>();
        }
    }
    logger.debug("Do fill gaps.");

    Document query = new Document();
    if (chromosomes != null && !chromosomes.isEmpty()) {
        // Restrict the (potentially collection-wide) update to the chromosomes seen in this file.
        query.put(DocumentToVariantConverter.CHROMOSOME_FIELD, new Document("$in", chromosomes));
    }

    // Match variants that contain this study, but where this file has not been loaded yet.
    query.put(DocumentToVariantConverter.STUDIES_FIELD, new Document("$elemMatch",
            new Document(
                    DocumentToStudyVariantEntryConverter.STUDYID_FIELD,
                    studyConfiguration.getStudyId())
                    .append(DocumentToStudyVariantEntryConverter.FILES_FIELD + "." + DocumentToStudyVariantEntryConverter.FILEID_FIELD,
                            new Document("$ne", fileId)
                    )
    ));

    // Push this file's sample ids into the "?/?" (unknown genotype) bucket of the matched study
    // entry, using the positional "$" operator.
    Document push = new Document()
            .append(DocumentToVariantConverter.STUDIES_FIELD
                    + ".$." + DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD
                    + "." + DocumentToSamplesConverter.UNKNOWN_GENOTYPE, new Document("$each", fileSampleIds));

//        List<Integer> loadedSamples = getLoadedSamples(fileId, studyConfiguration);
//        List<Object> missingOtherValues = new ArrayList<>(fileSampleIds.size());
//        for (int size = fileSampleIds.size(); size > 0; size--) {
//            missingOtherValues.add(DBObjectToSamplesConverter.UNKNOWN_FIELD);
//        }
//        List<String> extraFields = studyConfiguration.getAttributes()
//                .getAsStringList(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key());
//        for (String extraField : extraFields) {
//            push.put(DBObjectToVariantConverter.STUDIES_FIELD + ".$." + extraField.toLowerCase(),
//                    new Document("$each", missingOtherValues).append("$position", loadedSamples.size())
//            );
//        }

    Document update = new Document("$push", push);

    QueryOptions queryOptions = new QueryOptions(MULTI, true);
    logger.debug("FillGaps find : {}", query);
    logger.debug("FillGaps update : {}", update);
    return variantsCollection.update(query, update, queryOptions);
}
/**
 * Builds a {@link DocumentToVariantConverter} configured for the given query and options.
 * Pre-loads the StudyConfiguration of every returned study, and propagates the returned
 * samples, files and unknown-genotype settings into the nested converters.
 *
 * @param query   Query being executed.
 * @param options Query options (include/exclude fields, etc.).
 * @return A fully configured converter.
 * @throws VariantQueryException if any of the returned studies cannot be found.
 */
private DocumentToVariantConverter getDocumentToVariantConverter(Query query, QueryOptions options) {
    List<Integer> studyIds = utils.getReturnedStudies(query, options);
    DocumentToSamplesConverter sampleConverter = new DocumentToSamplesConverter(studyConfigurationManager);
    if (studyIds != null) {
        // Fetch every StudyConfiguration the samples converter will need.
        for (Integer studyId : studyIds) {
            QueryResult<StudyConfiguration> result = studyConfigurationManager.getStudyConfiguration(studyId, options);
            if (result.getResult().isEmpty()) {
                throw VariantQueryException.studyNotFound(studyId);
            }
            sampleConverter.addStudyConfiguration(result.first());
        }
    }

    String unknownGenotypeKey = VariantQueryParams.UNKNOWN_GENOTYPE.key();
    if (query.containsKey(unknownGenotypeKey)) {
        sampleConverter.setReturnedUnknownGenotype(query.getString(unknownGenotypeKey));
    }
    Set<VariantField> returnedFields = VariantField.getReturnedFields(options);
    sampleConverter.setReturnedSamples(getReturnedSamples(query, options));

    Collection<Integer> fileIds = utils.getReturnedFiles(query, options, returnedFields);
    DocumentToStudyVariantEntryConverter entryConverter =
            new DocumentToStudyVariantEntryConverter(false, fileIds, sampleConverter);
    entryConverter.setStudyConfigurationManager(studyConfigurationManager);

    return new DocumentToVariantConverter(entryConverter,
            new DocumentToVariantStatsConverter(studyConfigurationManager), studyIds);
}
/**
 * Adds a string filter for {@code key}, keeping each element of {@code value} as-is.
 *
 * @param key     Document field to filter on.
 * @param value   Raw filter value (may contain "," / ";" separated elements).
 * @param builder Builder to append the filter to.
 * @param op      How this filter combines with the rest of the query.
 * @return The same {@code builder}.
 */
private QueryBuilder addQueryStringFilter(String key, String value, final QueryBuilder builder, QueryOperation op) {
    // Identity mapping: elements are used verbatim.
    return this.addQueryFilter(key, value, builder, op, element -> element);
}
/**
 * Adds a filter for {@code key} whose elements are parsed as integers.
 *
 * @param key     Document field to filter on.
 * @param value   Raw filter value (may contain "," / ";" separated elements).
 * @param builder Builder to append the filter to.
 * @param op      How this filter combines with the rest of the query.
 * @return The same {@code builder}.
 * @throws VariantQueryException if any element is not a valid integer.
 */
private QueryBuilder addQueryIntegerFilter(String key, String value, final QueryBuilder builder, QueryOperation op) {
    Function<String, Integer> parseElement = element -> {
        try {
            return Integer.valueOf(element);
        } catch (NumberFormatException e) {
            // Report the offending token and preserve the original cause.
            throw new VariantQueryException("Unable to parse int " + element, e);
        }
    };
    return this.addQueryFilter(key, value, builder, op, parseElement);
}
/**
 * Collection variant of {@link #addQueryFilter}: joins the elements with the AND separator
 * and delegates to the String-based overload.
 *
 * @param key     Document field to filter on.
 * @param value   Elements to filter by; each is serialized with {@code toString()}.
 * @param builder Builder to append the filter to.
 * @param op      How this filter combines with the rest of the query.
 * @param map     Mapping applied to each element before building the filter.
 * @return The same {@code builder}.
 */
private <T> QueryBuilder addQueryFilter(String key, Collection<?> value, final QueryBuilder builder, QueryOperation op,
                                        Function<String, T> map) {
    String joined = value.stream()
            .map(Object::toString)
            .collect(Collectors.joining(AND));
    return addQueryFilter(key, joined, builder, op, map);
}
/**
 * Adds a filter for {@code key} to the builder, parsing {@code value} as a single element,
 * an OR sequence ("," separated) or an AND sequence (";" separated).
 * <p>
 * A leading "!" negates an element (not allowed inside OR sequences). Each element is
 * transformed with {@code map}; if the mapped value is a Collection, set operators
 * ($in/$nin/$all) are used instead of scalar ones.
 *
 * @param key     Document field to filter on.
 * @param value   Raw filter value.
 * @param builder Builder to append the filter to.
 * @param op      How this whole filter combines with the rest of the query; with
 *                {@link QueryOperation#OR} the filter is built apart and or-ed into {@code builder}.
 * @param map     Mapping applied to each element.
 * @return The same {@code builder}.
 * @throws VariantQueryException if a negated element appears inside an OR sequence.
 */
private <T> QueryBuilder addQueryFilter(String key, String value, final QueryBuilder builder, QueryOperation op,
                                        Function<String, T> map) {
    QueryOperation operation = checkOperator(value);
    QueryBuilder auxBuilder;
    if (op == QueryOperation.OR) {
        auxBuilder = QueryBuilder.start();
    } else {
        auxBuilder = builder;
    }

    if (operation == null) {
        // Single element; optional leading "!" negates it.
        if (value.startsWith("!")) {
            T mapped = map.apply(value.substring(1));
            if (mapped instanceof Collection) {
                auxBuilder.and(key).notIn(mapped);
            } else {
                auxBuilder.and(key).notEquals(mapped);
            }
        } else {
            T mapped = map.apply(value);
            if (mapped instanceof Collection) {
                auxBuilder.and(key).in(mapped);
            } else {
                auxBuilder.and(key).is(mapped);
            }
        }
    } else if (operation == QueryOperation.OR) {
        // OR sequence: flatten every mapped element into a single $in list.
        String[] array = value.split(OR);
        List<Object> list = new ArrayList<>(array.length);
        for (String elem : array) {
            if (elem.startsWith("!")) {
                throw new VariantQueryException("Unable to use negate (!) operator in OR sequences (<it_1>(,<it_n>)*)");
            } else {
                T mapped = map.apply(elem);
                if (mapped instanceof Collection) {
                    list.addAll((Collection<?>) mapped);
                } else {
                    list.add(mapped);
                }
            }
        }
        auxBuilder.and(key).in(list);
    } else {
        //Split in two lists: positive and negative
        String[] array = value.split(AND);
        List<Object> listIs = new ArrayList<>(array.length);
        List<Object> listNotIs = new ArrayList<>(array.length);

        for (String elem : array) {
            if (elem.startsWith("!")) {
                T mapped = map.apply(elem.substring(1));
                if (mapped instanceof Collection) {
                    listNotIs.addAll((Collection<?>) mapped);
                } else {
                    listNotIs.add(mapped);
                }
            } else {
                T mapped = map.apply(elem);
                if (mapped instanceof Collection) {
                    listIs.addAll((Collection<?>) mapped);
                } else {
                    listIs.add(mapped);
                }
            }
        }

        if (!listIs.isEmpty()) {    //Can not use method "is" because it will be overwritten with the "notEquals" or "notIn" method
            auxBuilder.and(key).all(listIs);
        }
        if (listNotIs.size() == 1) {
            auxBuilder.and(key).notEquals(listNotIs.get(0));
        } else if (listNotIs.size() > 1) {
            auxBuilder.and(key).notIn(listNotIs);
        }
    }

    if (op == QueryOperation.OR) {
        builder.or(auxBuilder.get());
    }
    return builder;
}
/**
 * Accept a list of comparative filters separated with "," or ";" with the expression:
 * {OPERATION}{VALUE}, where the accepted operations are: &lt;, &lt;=, &gt;, &gt;=, =, ==, !=, ~=.
 *
 * @param key       Document field to filter on.
 * @param value     Raw filter value, possibly a "," (OR) or ";" (AND) separated list.
 * @param builder   Builder to append the filter to.
 * @param extendKey Whether each element may carry a sub-key extending {@code key}.
 * @return The same {@code builder}.
 */
private QueryBuilder addCompListQueryFilter(String key, String value, QueryBuilder builder, boolean extendKey) {
    QueryOperation operation = checkOperator(value);
    // OR lists are accumulated into a detached builder and or-ed in at the end;
    // AND lists are appended directly.
    QueryBuilder target = operation == QueryOperation.OR ? QueryBuilder.start() : builder;
    for (String item : splitValue(value, operation)) {
        addCompQueryFilter(key, item, target, extendKey);
    }
    if (operation == QueryOperation.OR) {
        builder.or(target.get());
    }
    return builder;
}
/**
 * Parses a single comparative element of the form [{KEY}]{OPERATION}{VALUE} and appends it
 * to the builder.
 *
 * @param key       Base document field.
 * @param value     Element to parse.
 * @param builder   Builder to append the filter to.
 * @param extendKey When true and the element carries its own key part, it is appended to
 *                  {@code key} as a sub-field ("key.subKey").
 * @return The same {@code builder}.
 */
private QueryBuilder addCompQueryFilter(String key, String value, QueryBuilder builder, boolean extendKey) {
    String[] parts = splitKeyOpValue(value);
    String operator = "";
    if (parts.length == 3) {
        if (extendKey && !parts[0].isEmpty()) {
            key = key + "." + parts[0];
        }
        value = parts[2];
        operator = parts[1];
    }
    return addCompQueryFilter(key, value, builder, operator);
}
/**
 * Appends a single comparison ({@code op}) between {@code key} and {@code obj} to the builder.
 * <p>
 * Numeric operators parse {@code obj} as a double. For "=" / "==" and "!=", a non-numeric
 * value falls back to a string comparison. "~=" / "~" build a regex match.
 * NOTE(review): an unrecognized operator is silently ignored (no filter added) — confirm
 * callers always pass a validated operator.
 *
 * @param key     Document field to filter on.
 * @param obj     Value to compare against.
 * @param builder Builder to append the filter to.
 * @param op      Comparison operator: &lt;, &lt;=, &gt;, &gt;=, =, ==, !=, ~=, ~.
 * @return The same {@code builder}.
 */
private QueryBuilder addCompQueryFilter(String key, String obj, QueryBuilder builder, String op) {
    switch (op) {
        case "<":
            builder.and(key).lessThan(Double.parseDouble(obj));
            break;
        case "<=":
            builder.and(key).lessThanEquals(Double.parseDouble(obj));
            break;
        case ">":
            builder.and(key).greaterThan(Double.parseDouble(obj));
            break;
        case ">=":
            builder.and(key).greaterThanEquals(Double.parseDouble(obj));
            break;
        case "=":
        case "==":
            try {
                builder.and(key).is(Double.parseDouble(obj));
            } catch (NumberFormatException e) {
                // Non-numeric value: compare as string.
                builder.and(key).is(obj);
            }
            break;
        case "!=":
            // Fix: mirror the "=="/"=" behavior — previously a non-numeric value made this
            // case throw an uncaught NumberFormatException instead of comparing as string.
            try {
                builder.and(key).notEquals(Double.parseDouble(obj));
            } catch (NumberFormatException e) {
                builder.and(key).notEquals(obj);
            }
            break;
        case "~=":
        case "~":
            builder.and(key).regex(Pattern.compile(obj));
            break;
        default:
            break;
    }
    return builder;
}
/**
 * Appends a string comparison to the builder. The operator is extracted from the front of
 * {@code value}: "!" / "!=" builds a not-equals filter, "~" / "~=" a regex filter, and
 * anything else (including "", "=", "==") an equality filter.
 *
 * @param key     Document field to filter on.
 * @param value   Operator-prefixed value, e.g. "!=benign" or "~patho.*".
 * @param builder Builder to append the filter to.
 * @return The same {@code builder}.
 */
private QueryBuilder addStringCompQueryFilter(String key, String value, QueryBuilder builder) {
    String operator = getOperator(value);
    // Strip the leading operator; the remainder is the comparison operand.
    String operand = value.replaceFirst(operator, "");

    if ("!=".equals(operator) || "!".equals(operator)) {
        builder.and(key).notEquals(operand);
    } else if ("~=".equals(operator) || "~".equals(operator)) {
        builder.and(key).regex(Pattern.compile(operand));
    } else {
        // "", "=", "==" and any other prefix all mean plain equality.
        builder.and(key).is(operand);
    }
    return builder;
}
/**
 * Extracts the comparison operator prefix from {@code value} using {@code OPERATION_PATTERN}.
 *
 * @param value Operator-prefixed value.
 * @return The operator (pattern group 2), or "" when the value carries no operator.
 */
private String getOperator(String value) {
    Matcher matcher = OPERATION_PATTERN.matcher(value);
    return matcher.find() ? matcher.group(2) : "";
}
/**
 * Accepts a list of filters separated with "," or ";" with the expression: {SCORE}{OPERATION}{VALUE}.
 * Convenience overload without a default source: the score source must be present in every element.
 *
 * @param value                  Value to parse
 * @param builder                QueryBuilder
 * @param scoreParam             Score query param
 * @param allowDescriptionFilter Use string values as filters for the score description
 * @return QueryBuilder
 */
private QueryBuilder addScoreFilter(String value, QueryBuilder builder, VariantQueryParams scoreParam,
                                    boolean allowDescriptionFilter) {
    // No default source: each element must name its own score source.
    return addScoreFilter(value, builder, scoreParam, null, allowDescriptionFilter);
}
/**
 * Accepts a list of filters separated with "," or ";" with the expression: {SOURCE}{OPERATION}{VALUE}.
 * Numeric values filter on the score itself; non-numeric values filter on the score
 * description (only when {@code allowDescriptionFilter} is set).
 *
 * @param value                  Value to parse
 * @param builder                QueryBuilder
 * @param scoreParam             Score VariantQueryParam
 * @param defaultSource          Default source value. If null, must be present in the filter. If not, must not be present.
 * @param allowDescriptionFilter Use string values as filters for the score description
 * @return QueryBuilder
 * @throws VariantQueryException on malformed elements, unknown sources, or a non-numeric
 *                               value when description filters are not allowed.
 */
private QueryBuilder addScoreFilter(String value, QueryBuilder builder, VariantQueryParams scoreParam, final String defaultSource,
                                    boolean allowDescriptionFilter) {
    final List<String> list;
    QueryOperation operation = checkOperator(value);
    list = splitValue(value, operation);
    List<DBObject> dbObjects = new ArrayList<>();
    for (String elem : list) {
        String[] score = VariantDBAdaptorUtils.splitOperator(elem);
        String source;
        String op;
        String scoreValue;
        // No given score
        if (StringUtils.isEmpty(score[0])) {
            // Element has no source: the defaultSource must supply it.
            if (defaultSource == null) {
                logger.error("Bad score filter: " + elem);
                throw VariantQueryException.malformedParam(scoreParam, value);
            }
            source = defaultSource;
            op = score[1];
            scoreValue = score[2];
        } else {
            // Element names its own source: a defaultSource must NOT have been given.
            if (defaultSource != null) {
                logger.error("Bad score filter: " + elem);
                throw VariantQueryException.malformedParam(scoreParam, value);
            }
            source = score[0];
            op = score[1];
            scoreValue = score[2];
        }

        // Translate the source name into its short schema field.
        String key = DocumentToVariantAnnotationConverter.SCORE_FIELD_MAP.get(source);
        if (key == null) {
            // Unknown score
            throw VariantQueryException.malformedParam(scoreParam, value);
        }

        QueryBuilder scoreBuilder = new QueryBuilder();
        if (NumberUtils.isParsable(scoreValue)) {
            // Query by score
            key += '.' + DocumentToVariantAnnotationConverter.SCORE_SCORE_FIELD;
            addCompQueryFilter(key, scoreValue, scoreBuilder, op);
        } else if (allowDescriptionFilter) {
            // Query by description
            key += '.' + DocumentToVariantAnnotationConverter.SCORE_DESCRIPTION_FIELD;
            addStringCompQueryFilter(key, scoreValue, scoreBuilder);
        } else {
            throw VariantQueryException.malformedParam(scoreParam, value);
        }
        dbObjects.add(scoreBuilder.get());
    }

    // Combine all the per-element filters with the requested operation (AND by default).
    if (!dbObjects.isEmpty()) {
        if (operation == null || operation == QueryOperation.AND) {
            builder.and(dbObjects.toArray(new DBObject[dbObjects.size()]));
        } else {
            builder.and(new BasicDBObject("$or", dbObjects));
        }
    }
    return builder;
}
/**
 * Accepts a list of filters separated with "," or ";" with the expression:
 * {STUDY}:{POPULATION}{OPERATION}{VALUE}.
 * Convenience overload comparing against a single allele-frequency field.
 *
 * @param key                  PopulationFrequency schema field
 * @param alleleFrequencyField Allele frequency schema field
 * @param value                Value to parse
 * @param builder              QueryBuilder
 * @param queryParam           QueryParam filter
 * @return QueryBuilder
 */
private QueryBuilder addFrequencyFilter(String key, String alleleFrequencyField, String value, QueryBuilder builder,
                                        VariantQueryParams queryParam) {
    // Delegate to the generic overload, comparing each element against the frequency field.
    return addFrequencyFilter(key, value, builder, queryParam,
            (frequencyValue, frequencyBuilder) -> addCompQueryFilter(alleleFrequencyField, frequencyValue, frequencyBuilder, false));
}
/**
 * Accepts a list of filters separated with "," or ";" with the expression:
 * {STUDY}:{POPULATION}{OPERATION}{VALUE}.
 * <p>
 * Each element becomes an {@code $elemMatch} over the population-frequency array. For "&lt;"
 * comparisons, variants with no frequency entry at all, or none for that study/population,
 * also match (an absent frequency is presumably treated as 0 — confirm with the data model).
 *
 * @param key        PopulationFrequency schema field
 * @param value      Value to parse
 * @param builder    QueryBuilder
 * @param queryParam QueryParam used to report malformed values
 * @param addFilter  For complex filter
 * @return QueryBuilder
 * @throws VariantQueryException if an element does not match {STUDY}:{POPULATION}...
 */
private QueryBuilder addFrequencyFilter(String key, String value, QueryBuilder builder, VariantQueryParams queryParam,
                                        BiConsumer<String, QueryBuilder> addFilter) {
    final List<String> list;
    QueryOperation operation = checkOperator(value);
    list = splitValue(value, operation);

    List<BasicDBObject> dbObjects = new ArrayList<>();
    for (String elem : list) {
        // Expect exactly "study:population{op}{value}".
        String[] split = elem.split(IS);
        if (split.length != 2) {
            logger.error("Bad population frequency filter: " + elem);
            throw VariantQueryException.malformedParam(queryParam, value);
            //new IllegalArgumentException("Bad population frequency filter: " + elem);
        }
        String study = split[0];
        String population = split[1];
        String[] populationFrequency = splitKeyValue(population);
        logger.debug("populationFrequency = " + Arrays.toString(populationFrequency));

        // Match study + population inside a single array element.
        QueryBuilder frequencyBuilder = new QueryBuilder();
        frequencyBuilder.and(DocumentToVariantAnnotationConverter.POPULATION_FREQUENCY_STUDY_FIELD).is(study);
        frequencyBuilder.and(DocumentToVariantAnnotationConverter.POPULATION_FREQUENCY_POP_FIELD).is(populationFrequency[0]);
        // Snapshot of the study+population match BEFORE the value comparison is appended;
        // used below to detect "no entry for this study/population".
        Document studyPopFilter = new Document(frequencyBuilder.get().toMap());
        addFilter.accept(populationFrequency[1], frequencyBuilder);
        BasicDBObject elemMatch = new BasicDBObject(key, new BasicDBObject("$elemMatch", frequencyBuilder.get()));
        if (populationFrequency[1].startsWith("<")) {
            // "<" must also match variants with a missing frequency entry.
            BasicDBObject orNotExistsAnyPopulation = new BasicDBObject(key, new BasicDBObject("$exists", false));
            BasicDBObject orNotExistsPopulation =
                    new BasicDBObject(key, new BasicDBObject("$not", new BasicDBObject("$elemMatch", studyPopFilter)));
            dbObjects.add(new BasicDBObject("$or", Arrays.asList(orNotExistsAnyPopulation, orNotExistsPopulation, elemMatch)));
        } else {
            dbObjects.add(elemMatch);
        }
    }
    // Combine all the per-element filters with the requested operation (AND by default).
    if (!dbObjects.isEmpty()) {
        if (operation == null || operation == QueryOperation.AND) {
            builder.and(dbObjects.toArray(new BasicDBObject[dbObjects.size()]));
        } else {
            builder.and(new BasicDBObject("$or", dbObjects));
        }
    }
    return builder;
}
/**
 * Accept filters separated with "," or ";" with the expression:
 * [{STUDY}:]{COHORT}{OPERATION}{VALUE}.
 * Where STUDY is optional if defaultStudyConfiguration is provided.
 *
 * @param key                       Stats field to filter
 * @param values                    Values to parse
 * @param builder                   QueryBuilder
 * @param defaultStudyConfiguration Study to assume when an element omits the study part (nullable)
 */
private void addStatsFilterList(String key, String values, QueryBuilder builder, StudyConfiguration defaultStudyConfiguration) {
    QueryOperation operation = checkOperator(values);
    List<DBObject> filters = new LinkedList<>();
    // Each element becomes its own stats filter, built on a fresh builder.
    for (String value : splitValue(values, operation)) {
        filters.add(addStatsFilter(key, value, new QueryBuilder(), defaultStudyConfiguration).get());
    }
    if (!filters.isEmpty()) {
        DBObject[] filterArray = filters.toArray(new DBObject[filters.size()]);
        if (operation == QueryOperation.OR) {
            builder.or(filterArray);
        } else {
            builder.and(filterArray);
        }
    }
}
/**
 * Accepts filters with the expression: [{STUDY}:]{COHORT}{OPERATION}{VALUE}.
 * Where STUDY is optional if defaultStudyConfiguration is provided.
 * When neither a study prefix nor a default study is available, the filter is applied
 * directly on the flat "stats.{key}" field instead of an {@code $elemMatch}.
 *
 * @param key                       Stats field to filter
 * @param filter                    Filter to parse
 * @param builder                   QueryBuilder
 * @param defaultStudyConfiguration Study to assume when the filter omits the study part (nullable)
 * @return QueryBuilder
 */
private QueryBuilder addStatsFilter(String key, String filter, QueryBuilder builder, StudyConfiguration defaultStudyConfiguration) {
    if (filter.contains(":") || defaultStudyConfiguration != null) {
        Integer studyId;
        Integer cohortId;
        String operator;
        String valueStr;
        if (filter.contains(":")) {
            // "{study}:{cohort}{op}{value}"
            // NOTE(review): split(":") assumes the value itself contains no ":" — confirm.
            String[] studyValue = filter.split(":");
            String[] cohortOpValue = VariantDBAdaptorUtils.splitOperator(studyValue[1]);
            String study = studyValue[0];
            String cohort = cohortOpValue[0];
            operator = cohortOpValue[1];
            valueStr = cohortOpValue[2];

            StudyConfiguration studyConfiguration = utils.getStudyConfiguration(study, defaultStudyConfiguration);
            cohortId = utils.getCohortId(cohort, studyConfiguration);
            studyId = studyConfiguration.getStudyId();
        } else {
//            String study = defaultStudyConfiguration.getStudyName();
            // No study prefix: fall back to the default study.
            studyId = defaultStudyConfiguration.getStudyId();
            String[] cohortOpValue = VariantDBAdaptorUtils.splitOperator(filter);
            String cohort = cohortOpValue[0];
            cohortId = utils.getCohortId(cohort, defaultStudyConfiguration);
            operator = cohortOpValue[1];
            valueStr = cohortOpValue[2];
        }

        // Build an $elemMatch over the stats array: same element must match study, cohort and value.
        QueryBuilder statsBuilder = new QueryBuilder();
        statsBuilder.and(DocumentToVariantStatsConverter.STUDY_ID).is(studyId);
        if (cohortId != null) {
            statsBuilder.and(DocumentToVariantStatsConverter.COHORT_ID).is(cohortId);
        }
        addCompQueryFilter(key, valueStr, statsBuilder, operator);
        builder.and(DocumentToVariantConverter.STATS_FIELD).elemMatch(statsBuilder.get());
    } else {
        // No study information at all: filter on the flat stats field across all studies/cohorts.
        addCompQueryFilter(DocumentToVariantConverter.STATS_FIELD + "." + key, filter, builder, false);
    }
    return builder;
}
/**
 * Appends a filter matching variants overlapping the given region, using the chunk-id index
 * plus start/end coordinate bounds.
 *
 * @param region  Region to match.
 * @param builder Builder to append the filter to.
 * @return The same {@code builder}.
 */
private QueryBuilder getRegionFilter(Region region, QueryBuilder builder) {
    String chunkIdField = DocumentToVariantConverter.AT_FIELD + '.' + DocumentToVariantConverter.CHUNK_IDS_FIELD;
    builder.and(chunkIdField).in(getChunkIds(region));
    // Overlap test: variant.end >= region.start AND variant.start <= region.end.
    builder.and(DocumentToVariantConverter.END_FIELD).greaterThanEquals(region.getStart());
    builder.and(DocumentToVariantConverter.START_FIELD).lessThanEquals(region.getEnd());
    return builder;
}
/**
 * Appends an OR of per-region filters, each expressed as an "_id" range over the
 * chromosome-and-position-prefixed storage ids.
 *
 * @param regions Regions to match; a null or empty list leaves the builder untouched.
 * @param builder Builder to append the filter to.
 * @return The same {@code builder}.
 */
private QueryBuilder getRegionFilter(List<Region> regions, QueryBuilder builder) {
    if (regions != null && !regions.isEmpty()) {
        List<DBObject> regionFilters = new ArrayList<>(regions.size());
        for (Region region : regions) {
            // The upper bound is exclusive, so bump the end by one — guarding against overflow.
            int end = region.getEnd();
            if (end < Integer.MAX_VALUE) {
                end++;
            }
            DBObject regionFilter = new BasicDBObject("_id", new Document()
                    .append("$gte", VariantStringIdConverter.buildId(region.getChromosome(), region.getStart()))
                    .append("$lt", VariantStringIdConverter.buildId(region.getChromosome(), end)));
            regionFilters.add(regionFilter);
        }
        builder.or(regionFilters.toArray(new DBObject[regionFilters.size()]));
    }
    return builder;
}
/* Query util methods */
/**
 * Parses the string as an integer number.
 *
 * @param string String to parse, may be null
 * @return The parsed value, or null if the string was not a valid integer
 */
private Integer parseInteger(String string) {
    try {
        return Integer.valueOf(string);
    } catch (NumberFormatException ignored) {
        // Not a number (or null): signal the caller with a null result.
        return null;
    }
}
/**
 * Create missing indexes on the main variants collection of this adaptor.
 * Delegates to {@link #createIndexes(QueryOptions, MongoDBCollection)}.
 *
 * @param options Unused Options.
 */
public void createIndexes(QueryOptions options) {
    createIndexes(options, variantsCollection);
}
/**
 * Create missing indexes on the given VariantsCollection.
 * Variant indices
 * - ChunkID
 * - Chromosome + start + end
 * - IDs
 * <p>
 * Study indices
 * - StudyId + FileId
 * <p>
 * Stats indices
 * - StatsMaf
 * - StatsMgf
 * <p>
 * Annotation indices
 * - XRef.id
 * - ConsequenceType.so
 * - _gn_so : SPARSE
 * - PopulationFrequency Study + Population + AlternateFrequency : SPARSE
 * - Clinical.Clinvar.clinicalSignificance : SPARSE
 * ConservedRegionScore
 * - phastCons.score
 * - phylop.score
 * - gerp.score
 * FunctionalScore
 * - cadd_scaled
 * - cadd_raw
 * - Drugs.name : SPARSE
 * ProteinSubstitution
 * - polyphen.score : SPARSE
 * - polyphen.description : SPARSE
 * - sift.score : SPARSE
 * - sift.description : SPARSE
 * - ProteinVariantAnnotation.keywords : SPARSE
 * - TranscriptAnnotationFlags : SPARSE
 *
 * @param options Unused Options.
 * @param variantsCollection MongoDBCollection
 */
public static void createIndexes(QueryOptions options, MongoDBCollection variantsCollection) {
    logger.info("Start creating indexes");
    // All indexes are built in background so the collection stays available while indexing.
    ObjectMap onBackground = new ObjectMap(MongoDBCollection.BACKGROUND, true);
    // Sparse indexes skip documents that lack the indexed field, keeping the index small.
    ObjectMap onBackgroundSparse = new ObjectMap(MongoDBCollection.BACKGROUND, true).append(MongoDBCollection.SPARSE, true);
    createVariantIndexes(variantsCollection, onBackground);
    createStudyIndexes(variantsCollection, onBackground);
    createStatsIndexes(variantsCollection, onBackground);
    createAnnotationIndexes(variantsCollection, onBackground, onBackgroundSparse);
    logger.debug("sent order to create indices");
}

/** Core variant indexes: chunk ids, chromosome + start + end, and variant ids. */
private static void createVariantIndexes(MongoDBCollection variantsCollection, ObjectMap onBackground) {
    variantsCollection.createIndex(new Document(DocumentToVariantConverter.AT_FIELD + '.'
            + DocumentToVariantConverter.CHUNK_IDS_FIELD, 1), onBackground);
    variantsCollection.createIndex(new Document(DocumentToVariantConverter.CHROMOSOME_FIELD, 1)
            .append(DocumentToVariantConverter.START_FIELD, 1)
            .append(DocumentToVariantConverter.END_FIELD, 1), onBackground);
    variantsCollection.createIndex(new Document(DocumentToVariantConverter.IDS_FIELD, 1), onBackground);
}

/** Study indexes: compound studyId + fileId. */
private static void createStudyIndexes(MongoDBCollection variantsCollection, ObjectMap onBackground) {
    variantsCollection.createIndex(new Document(DocumentToVariantConverter.STUDIES_FIELD
            + "." + DocumentToStudyVariantEntryConverter.STUDYID_FIELD, 1)
            .append(DocumentToVariantConverter.STUDIES_FIELD
                    + "." + DocumentToStudyVariantEntryConverter.FILES_FIELD
                    + "." + DocumentToStudyVariantEntryConverter.FILEID_FIELD, 1), onBackground);
}

/** Stats indexes: maf and mgf. */
private static void createStatsIndexes(MongoDBCollection variantsCollection, ObjectMap onBackground) {
    variantsCollection.createIndex(new Document(DocumentToVariantConverter.STATS_FIELD + "." + DocumentToVariantStatsConverter
            .MAF_FIELD, 1), onBackground);
    variantsCollection.createIndex(new Document(DocumentToVariantConverter.STATS_FIELD + "." + DocumentToVariantStatsConverter
            .MGF_FIELD, 1), onBackground);
}

/** Annotation indexes: xrefs, consequence types, frequencies, clinical and score fields. */
private static void createAnnotationIndexes(MongoDBCollection variantsCollection, ObjectMap onBackground,
                                            ObjectMap onBackgroundSparse) {
    // XRefs.id
    variantsCollection.createIndex(new Document()
                    .append(DocumentToVariantConverter.ANNOTATION_FIELD
                            + "." + DocumentToVariantAnnotationConverter.XREFS_FIELD
                            + "." + DocumentToVariantAnnotationConverter.XREF_ID_FIELD, 1),
            onBackground);
    // ConsequenceType.so
    variantsCollection.createIndex(new Document()
                    .append(DocumentToVariantConverter.ANNOTATION_FIELD
                            + "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
                            + "." + DocumentToVariantAnnotationConverter.CT_SO_ACCESSION_FIELD, 1),
            onBackground);
    // _gn_so : SPARSE
    variantsCollection.createIndex(new Document()
                    .append(DocumentToVariantConverter.ANNOTATION_FIELD
                            + "." + DocumentToVariantAnnotationConverter.GENE_SO_FIELD, 1),
            onBackgroundSparse);
    // Population frequency : SPARSE
    variantsCollection.createIndex(new Document()
                    .append(DocumentToVariantConverter.ANNOTATION_FIELD
                            + "." + DocumentToVariantAnnotationConverter.POPULATION_FREQUENCIES_FIELD
                            + "." + DocumentToVariantAnnotationConverter.POPULATION_FREQUENCY_STUDY_FIELD, 1)
                    .append(DocumentToVariantConverter.ANNOTATION_FIELD
                            + "." + DocumentToVariantAnnotationConverter.POPULATION_FREQUENCIES_FIELD
                            + "." + DocumentToVariantAnnotationConverter.POPULATION_FREQUENCY_POP_FIELD, 1)
                    .append(DocumentToVariantConverter.ANNOTATION_FIELD
                            + "." + DocumentToVariantAnnotationConverter.POPULATION_FREQUENCIES_FIELD
                            + "." + DocumentToVariantAnnotationConverter.POPULATION_FREQUENCY_ALTERNATE_FREQUENCY_FIELD, 1),
            onBackgroundSparse);
    // Clinical clinvar : SPARSE
    variantsCollection.createIndex(new Document()
                    .append(DocumentToVariantConverter.ANNOTATION_FIELD
                            + "." + DocumentToVariantAnnotationConverter.CLINICAL_DATA_FIELD
                            + "." + DocumentToVariantAnnotationConverter.CLINICAL_CLINVAR_FIELD
                            + ".clinicalSignificance", 1),
            onBackgroundSparse);
    // Conserved region score (phastCons, phylop, gerp)
    variantsCollection.createIndex(new Document(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CONSERVED_REGION_GERP_FIELD
                    + "." + DocumentToVariantAnnotationConverter.SCORE_SCORE_FIELD, 1),
            onBackground);
    variantsCollection.createIndex(new Document(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CONSERVED_REGION_PHYLOP_FIELD
                    + "." + DocumentToVariantAnnotationConverter.SCORE_SCORE_FIELD, 1),
            onBackground);
    variantsCollection.createIndex(new Document(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CONSERVED_REGION_PHASTCONS_FIELD
                    + "." + DocumentToVariantAnnotationConverter.SCORE_SCORE_FIELD, 1),
            onBackground);
    // Functional score (cadd_scaled, cadd_raw)
    variantsCollection.createIndex(new Document(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.FUNCTIONAL_CADD_SCALED_FIELD
                    + "." + DocumentToVariantAnnotationConverter.SCORE_SCORE_FIELD, 1),
            onBackground);
    variantsCollection.createIndex(new Document(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.FUNCTIONAL_CADD_RAW_FIELD
                    + "." + DocumentToVariantAnnotationConverter.SCORE_SCORE_FIELD, 1),
            onBackground);
    // Drugs : SPARSE
    variantsCollection.createIndex(new Document()
                    .append(DocumentToVariantConverter.ANNOTATION_FIELD
                            + "." + DocumentToVariantAnnotationConverter.DRUG_FIELD
                            + "." + DocumentToVariantAnnotationConverter.DRUG_NAME_FIELD, 1),
            onBackgroundSparse);
    // Protein substitution score (polyphen , sift) : SPARSE
    variantsCollection.createIndex(new Document(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CT_PROTEIN_POLYPHEN_FIELD
                    + "." + DocumentToVariantAnnotationConverter.SCORE_SCORE_FIELD, 1),
            onBackgroundSparse);
    variantsCollection.createIndex(new Document(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CT_PROTEIN_SIFT_FIELD
                    + "." + DocumentToVariantAnnotationConverter.SCORE_SCORE_FIELD, 1),
            onBackgroundSparse);
    // Protein substitution score description (polyphen , sift) : SPARSE
    variantsCollection.createIndex(new Document(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CT_PROTEIN_POLYPHEN_FIELD
                    + "." + DocumentToVariantAnnotationConverter.SCORE_DESCRIPTION_FIELD, 1),
            onBackgroundSparse);
    variantsCollection.createIndex(new Document(DocumentToVariantConverter.ANNOTATION_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
                    + "." + DocumentToVariantAnnotationConverter.CT_PROTEIN_SIFT_FIELD
                    + "." + DocumentToVariantAnnotationConverter.SCORE_DESCRIPTION_FIELD, 1),
            onBackgroundSparse);
    // Protein Keywords : SPARSE
    variantsCollection.createIndex(new Document()
                    .append(DocumentToVariantConverter.ANNOTATION_FIELD
                            + "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
                            + "." + DocumentToVariantAnnotationConverter.CT_PROTEIN_KEYWORDS, 1),
            onBackgroundSparse);
    // TranscriptAnnotationFlags : SPARSE
    variantsCollection.createIndex(new Document()
                    .append(DocumentToVariantConverter.ANNOTATION_FIELD
                            + "." + DocumentToVariantAnnotationConverter.CONSEQUENCE_TYPE_FIELD
                            + "." + DocumentToVariantAnnotationConverter.CT_TRANSCRIPT_ANNOT_FLAGS, 1),
            onBackgroundSparse);
}
/**
 * This method split a typical key-value param such as 'sift<=0.2' in an array ["sift", "<=0.2"].
 * This implementation can and probably should be improved.
 *
 * @param keyValue The keyvalue parameter to be split
 * @return An array with 2 positions for the key and value, or a single-element
 *         array containing the input when no operator is found
 * @deprecated use {@link VariantDBAdaptorUtils#splitOperator(String)}
 */
@Deprecated
private String[] splitKeyValue(String keyValue) {
    Matcher matcher = OPERATION_PATTERN.matcher(keyValue);
    // Group 1 = key, group 2 = operator, group 3 = value; operator and value stay joined.
    return matcher.find()
            ? new String[]{matcher.group(1), matcher.group(2) + matcher.group(3)}
            : new String[]{keyValue};
}
/**
 * Splits a key-operator-value param such as 'sift<=0.2' into ["sift", "<=", "0.2"].
 *
 * @param keyValue The key-operator-value parameter to be split
 * @return An array with 3 positions for key, operator and value, or a single-element
 *         array containing the input when no operator is found
 * @deprecated use {@link VariantDBAdaptorUtils#splitOperator(String)}
 */
@Deprecated
private String[] splitKeyOpValue(String keyValue) {
    Matcher matcher = OPERATION_PATTERN.matcher(keyValue);
    // Unlike splitKeyValue, the operator (group 2) is returned separately from the value.
    return matcher.find()
            ? new String[]{matcher.group(1), matcher.group(2), matcher.group(3)}
            : new String[]{keyValue};
}
/* *******************
* Auxiliary methods *
* *******************/
/**
 * Builds the list of chunk ids ("chromosome_chunkNumber_sizeInKb" + "k") covering a region.
 * Regions larger than the big chunk size use big chunks; smaller regions use small chunks.
 *
 * @param region Region to cover
 * @return Chunk ids for every chunk overlapped by the region
 */
private List<String> getChunkIds(Region region) {
    int chunkSize = (region.getEnd() - region.getStart() > VariantMongoDBWriter.CHUNK_SIZE_BIG)
            ? VariantMongoDBWriter.CHUNK_SIZE_BIG
            : VariantMongoDBWriter.CHUNK_SIZE_SMALL;
    int ks = chunkSize / 1000; // chunk size expressed in kb, used as the id suffix
    int firstChunk = region.getStart() / chunkSize;
    int lastChunk = region.getEnd() / chunkSize;
    List<String> chunkIds = new LinkedList<>();
    for (int chunk = firstChunk; chunk <= lastChunk; chunk++) {
        chunkIds.add(region.getChromosome() + "_" + chunk + "_" + ks + "k");
    }
    return chunkIds;
}
/**
 * Returns the chunk number containing the given position.
 *
 * @param position  Genomic position
 * @param chunkSize Size of each chunk
 * @return Zero-based chunk number
 */
private int getChunkId(int position, int chunkSize) {
    return position / chunkSize;
}
/**
 * Returns the first position of the given chunk.
 *
 * @param id        Zero-based chunk number
 * @param chunkSize Size of each chunk
 * @return Start position of the chunk; the very first chunk starts at 1, not 0
 */
private int getChunkStart(int id, int chunkSize) {
    if (id == 0) {
        return 1; // the first chunk starts at position 1
    }
    return id * chunkSize;
}
/**
 * Returns the last position of the given chunk (inclusive).
 *
 * @param id        Zero-based chunk number
 * @param chunkSize Size of each chunk
 * @return End position of the chunk, i.e. one position before the next chunk starts
 */
private int getChunkEnd(int id, int chunkSize) {
    int nextChunkStart = (id + 1) * chunkSize;
    return nextChunkStart - 1;
}
/**
 * Returns the manager used to read and update study configurations.
 *
 * @return The current StudyConfigurationManager
 */
@Override
public StudyConfigurationManager getStudyConfigurationManager() {
    return studyConfigurationManager;
}
/**
 * Returns the adaptor used to access variant source (file metadata) documents.
 *
 * @return The VariantSourceMongoDBAdaptor of this adaptor
 */
@Override
public VariantSourceMongoDBAdaptor getVariantSourceDBAdaptor() {
    return variantSourceMongoDBAdaptor;
}
/**
 * Replaces the manager used to read and update study configurations.
 *
 * @param studyConfigurationManager New StudyConfigurationManager to use
 */
@Override
public void setStudyConfigurationManager(StudyConfigurationManager studyConfigurationManager) {
    this.studyConfigurationManager = studyConfigurationManager;
}
/**
 * Returns the CellBase REST client held by this adaptor.
 *
 * @return The CellBaseClient instance
 */
@Override
public CellBaseClient getCellBaseClient() {
    return cellBaseClient;
}
/**
 * Returns the query/parsing utility helper of this adaptor.
 *
 * @return The VariantDBAdaptorUtils instance
 */
@Override
public VariantDBAdaptorUtils getDBAdaptorUtils() {
    return utils;
}
/**
 * Collects the sample ids already loaded from all indexed files, excluding every sample
 * that belongs to the given file (samples shared between files are removed as well).
 *
 * @param fileId             File whose samples must be excluded
 * @param studyConfiguration Study configuration with the indexed files and their samples
 * @return Sample ids loaded from other indexed files, none of which belong to fileId
 */
public static List<Integer> getLoadedSamples(int fileId, StudyConfiguration studyConfiguration) {
    List<Integer> loadedSampleIds = new LinkedList<>();
    for (Integer indexedFile : studyConfiguration.getIndexedFiles()) {
        if (!indexedFile.equals(fileId)) {
            loadedSampleIds.addAll(studyConfiguration.getSamplesInFiles().get(indexedFile));
        }
    }
    // Samples may appear in several files; drop any sample that also belongs to this file.
    List<Integer> ownSamples = studyConfiguration.getSamplesInFiles().get(fileId);
    if (ownSamples != null) { // guard: the file may not be registered in samplesInFiles yet
        loadedSampleIds.removeAll(ownSamples);
    }
    return loadedSampleIds;
}
/**
 * Sets the search manager used for search-engine (e.g. Solr) backed queries.
 *
 * @param variantSearchManager New VariantSearchManager to use
 * @return this, for fluent chaining
 */
public VariantMongoDBAdaptor setVariantSearchManager(VariantSearchManager variantSearchManager) {
    this.variantSearchManager = variantSearchManager;
    return this;
}
}