/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.opencga.storage.mongodb.variant.load.stage; import com.mongodb.client.model.Sorts; import org.bson.Document; import org.bson.conversions.Bson; import org.opencb.biodata.models.variant.Variant; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.datastore.mongodb.MongoDBCollection; import org.opencb.commons.datastore.mongodb.MongoPersistentCursor; import org.opencb.commons.io.DataReader; import org.opencb.opencga.storage.mongodb.variant.converters.VariantStringIdConverter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.List; import static com.mongodb.client.model.Filters.*; /** * DataReader for Variant stage collection. * Given a MongoDBCollection and a studyId, iterates over the collection * returning sorted results. * * Created on 13/04/16. * * @author Jacobo Coll <jacobo167@gmail.com> */ public class MongoDBVariantStageReader implements DataReader<Document> { private final MongoDBCollection stageCollection; private final int studyId; private final Collection<String> chromosomes; private MongoPersistentCursor iterator; private Document next = null; // Pending variant private final Logger logger = LoggerFactory.getLogger(MongoDBVariantStageReader.class); public MongoDBVariantStageReader(MongoDBCollection stageCollection, int studyId) { this.stageCollection = stageCollection; this.studyId = studyId; this.chromosomes = Collections.emptyList(); } public MongoDBVariantStageReader(MongoDBCollection stageCollection, int studyId, Collection<String> chromosomes) { this.stageCollection = stageCollection; this.studyId = studyId; this.chromosomes = chromosomes == null ? Collections.emptyList() : chromosomes; } public long countNumVariants() { return stageCollection.nativeQuery().count(getQuery()); } public long countAproxNumVariants() { return stageCollection.count().first(); } @Override public boolean open() { //Filter documents with the selected studyId and chromosomes //Sorting by _id // FindIterable<Document> iterable = stageCollection.nativeQuery().find(getQuery(), // new QueryOptions(QueryOptions.SORT, Sorts.ascending("_id")) // ); // iterable.batchSize(20); // this.iterator = iterable.iterator(); QueryOptions options = new QueryOptions(QueryOptions.SORT, Sorts.ascending("_id")); iterator = new MongoPersistentCursor(stageCollection, getQuery(), null, options) .setBatchSize(20); return true; } protected Bson getQuery() { ArrayList<Bson> chrFilters = new ArrayList<>(chromosomes.size()); for (String chromosome : chromosomes) { addChromosomeFilter(chrFilters, chromosome); } Bson bson; if (chrFilters.isEmpty()) { bson = exists(Integer.toString(studyId)); } else { bson = and(exists(Integer.toString(studyId)), or(chrFilters)); // Be in any of these chromosomes } logger.debug("stage filter: " + bson.toBsonDocument(Document.class, com.mongodb.MongoClient.getDefaultCodecRegistry())); return bson; } public static void addChromosomeFilter(List<Bson> chrFilters, String chromosome) { if (chromosome == null || chromosome.isEmpty()) { return; } chromosome = VariantStringIdConverter.convertChromosome(chromosome); chrFilters.add(and( gte("_id", chromosome + VariantStringIdConverter.SEPARATOR_CHAR), lt("_id", chromosome + (char) (VariantStringIdConverter.SEPARATOR_CHAR + 1)))); } @Override public List<Document> read(int b) { List<Document> list = new ArrayList<>(b); // If there were some pending variant, add to the list. Document last = next; if (next != null) { list.add(next); next = null; } for (int i = list.size(); i < b; i++) { if (iterator.hasNext()) { last = iterator.next(); list.add(last); } } if (iterator.hasNext()) { // Obtain the LastVariant from the read LastDocument Variant lastVar = MongoDBVariantStageLoader.STRING_ID_CONVERTER.convertToDataModelType(last); int start = lastVar.getStart(); int end = lastVar.getEnd(); String chr = lastVar.getChromosome(); while (iterator.hasNext()) { // Get the next document. Check if this should be in the current batch. // If not, will be added as the first element of the next batch next = iterator.next(); Variant nextVar = MongoDBVariantStageLoader.STRING_ID_CONVERTER.convertToDataModelType(next); // If the last and next variants overlaps, add next to the batch. if (nextVar.overlapWith(chr, start, end, true)) { list.add(next); logger.debug("Add overlapping variant last: {}, next: {}", lastVar, nextVar); // Adding next to the batch, next is the new last. last = next; lastVar = nextVar; start = Math.min(start, nextVar.getStart()); end = Math.max(end, nextVar.getEnd()); next = null; } else { // If they are not overlapped, stop looping. break; } } } return list; } @Override public boolean close() { iterator.close(); return true; } }