/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.mongodb.variant;
import com.google.common.collect.BiMap;
import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.ListMultimap;
import org.apache.commons.lang3.time.StopWatch;
import org.bson.Document;
import org.opencb.biodata.formats.variant.io.VariantReader;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.models.variant.VariantStudy;
import org.opencb.biodata.models.variant.avro.VariantType;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.mongodb.MongoDBCollection;
import org.opencb.commons.run.ParallelTaskRunner;
import org.opencb.opencga.core.common.ProgressLogger;
import org.opencb.opencga.storage.core.config.StorageConfiguration;
import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
import org.opencb.opencga.storage.core.metadata.BatchFileOperation;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.metadata.StudyConfigurationManager;
import org.opencb.opencga.storage.core.variant.VariantStoragePipeline;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor;
import org.opencb.opencga.storage.core.variant.adaptors.VariantSourceDBAdaptor;
import org.opencb.opencga.storage.core.variant.io.VariantReaderUtils;
import org.opencb.opencga.storage.mongodb.variant.adaptors.VariantMongoDBAdaptor;
import org.opencb.opencga.storage.mongodb.variant.converters.DocumentToSamplesConverter;
import org.opencb.opencga.storage.mongodb.variant.exceptions.MongoVariantStorageEngineException;
import org.opencb.opencga.storage.mongodb.variant.load.MongoDBVariantWriteResult;
import org.opencb.opencga.storage.mongodb.variant.load.stage.MongoDBVariantStageConverterTask;
import org.opencb.opencga.storage.mongodb.variant.load.stage.MongoDBVariantStageLoader;
import org.opencb.opencga.storage.mongodb.variant.load.stage.MongoDBVariantStageReader;
import org.opencb.opencga.storage.mongodb.variant.load.variants.MongoDBOperations;
import org.opencb.opencga.storage.mongodb.variant.load.variants.MongoDBVariantMergeLoader;
import org.opencb.opencga.storage.mongodb.variant.load.variants.MongoDBVariantMerger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Predicate;
import static org.opencb.opencga.storage.core.variant.VariantStorageEngine.Options;
import static org.opencb.opencga.storage.mongodb.variant.MongoDBVariantStorageEngine.MongoDBVariantOptions.*;
/**
* Created on 30/03/16.
*
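* <p>A minimal end-to-end usage sketch (names are illustrative; in practice this pipeline is created and
* driven by the MongoDBVariantStorageEngine):
* <pre>{@code
* URI preloaded = pipeline.preLoad(input, output);
* URI loaded = pipeline.load(preloaded);   // runs stage and/or merge, depending on the options
* pipeline.postLoad(loaded, output);
* }</pre>
*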
* @author Jacobo Coll <jacobo167@gmail.com>
*/
public class MongoDBVariantStoragePipeline extends VariantStoragePipeline {
private final VariantMongoDBAdaptor dbAdaptor;
private final ObjectMap loadStats = new ObjectMap();
public MongoDBVariantStoragePipeline(StorageConfiguration configuration, String storageEngineId,
VariantMongoDBAdaptor dbAdaptor) {
super(configuration, storageEngineId, LoggerFactory.getLogger(MongoDBVariantStoragePipeline.class), dbAdaptor,
new VariantReaderUtils());
this.dbAdaptor = dbAdaptor;
}
@Override
public URI preLoad(URI input, URI output) throws StorageEngineException {
URI uri = super.preLoad(input, output);
if (isResumeStage(options)) {
logger.info("Resume stage load.");
// Clean stage collection?
}
return uri;
}
@Override
protected void securePreLoad(StudyConfiguration studyConfiguration, VariantSource source) throws StorageEngineException {
super.securePreLoad(studyConfiguration, source);
int fileId = options.getInt(Options.FILE_ID.key());
if (studyConfiguration.getAttributes().containsKey(DEFAULT_GENOTYPE.key())) {
Set<String> defaultGenotype = new HashSet<>(studyConfiguration.getAttributes().getAsStringList(DEFAULT_GENOTYPE.key()));
logger.debug("Using default genotype from study configuration: {}", defaultGenotype);
} else {
Set<String> defaultGenotype;
if (options.containsKey(DEFAULT_GENOTYPE.key())) {
defaultGenotype = new HashSet<>(options.getAsStringList(DEFAULT_GENOTYPE.key()));
} else {
VariantStudy.StudyType studyType = options.get(Options.STUDY_TYPE.key(), VariantStudy.StudyType.class, Options.STUDY_TYPE
.defaultValue());
switch (studyType) {
case FAMILY:
case TRIO:
case PAIRED:
case PAIRED_TUMOR:
defaultGenotype = Collections.singleton(DocumentToSamplesConverter.UNKNOWN_GENOTYPE);
logger.debug("Do not compress genotypes. Default genotype : {}", defaultGenotype);
break;
default:
defaultGenotype = new HashSet<>(DEFAULT_GENOTYPE.defaultValue());
logger.debug("No default genotype found. Using default genotype: {}", defaultGenotype);
break;
}
}
studyConfiguration.getAttributes().put(DEFAULT_GENOTYPE.key(), defaultGenotype);
}
boolean newSampleBatch = checkCanLoadSampleBatch(studyConfiguration, fileId);
if (newSampleBatch) {
logger.info("New sample batch!!!");
//TODO: Check if there are regions with gaps
// ArrayList<Integer> indexedFiles = new ArrayList<>(studyConfiguration.getIndexedFiles());
// if (!indexedFiles.isEmpty()) {
// LinkedHashSet<Integer> sampleIds = studyConfiguration.getSamplesInFiles().get(indexedFiles.get(indexedFiles.size() - 1));
// if (!sampleIds.isEmpty()) {
// Integer sampleId = sampleIds.iterator().next();
// String files = "";
// for (Integer indexedFileId : indexedFiles) {
// if (studyConfiguration.getSamplesInFiles().get(indexedFileId).contains(sampleId)) {
// files += "!" + indexedFileId + ";";
// }
// }
//// String genotypes = sampleIds.stream().map(i -> studyConfiguration.getSampleIds().inverse().get(i) + ":" +
// DBObjectToSamplesConverter.UNKNOWN_GENOTYPE).collect(Collectors.joining(","));
// String genotypes = sampleId + ":" + DBObjectToSamplesConverter.UNKNOWN_GENOTYPE;
// Long v = getDBAdaptor(null).count(new Query()
// .append(VariantDBAdaptor.VariantQueryParams.STUDIES.key(), studyConfiguration.getStudyId())
// .append(VariantDBAdaptor.VariantQueryParams.FILES.key(), files)
// .append(VariantDBAdaptor.VariantQueryParams.GENOTYPE.key(), genotypes)).first();
// }
// }
}
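// If neither STAGE nor MERGE was explicitly requested, run the full load: stage and then merge.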
boolean doMerge = options.getBoolean(MERGE.key(), false);
boolean doStage = options.getBoolean(STAGE.key(), false);
if (!doMerge && !doStage) {
doMerge = true;
doStage = true;
}
options.put(MERGE.key(), doMerge);
options.put(STAGE.key(), doStage);
securePreStage(fileId, studyConfiguration);
// QueryResult<Long> countResult = dbAdaptor.count(new Query(VariantDBAdaptor.VariantQueryParams.STUDIES.key(), studyConfiguration
// .getStudyId())
// .append(VariantDBAdaptor.VariantQueryParams.FILES.key(), fileId));
// Long count = countResult.first();
// if (count != 0) {
// logger.warn("Resume mode. There are already loaded variants from the file "
// + studyConfiguration.getFileIds().inverse().get(fileId) + " : " + fileId + " ");
// options.put(ALREADY_LOADED_VARIANTS.key(), count);
// }
}
@Override
public URI load(URI inputUri) throws IOException, StorageEngineException {
// boolean includeSamples = options.getBoolean(Options.INCLUDE_GENOTYPES.key(), Options.INCLUDE_GENOTYPES.defaultValue());
// boolean includeStats = options.getBoolean(Options.INCLUDE_STATS.key(), Options.INCLUDE_STATS.defaultValue());
// boolean includeSrc = options.getBoolean(Options.INCLUDE_SRC.key(), Options.INCLUDE_SRC.defaultValue());
// boolean compressGenotypes = options.getBoolean(Options.COMPRESS_GENOTYPES.key(), false);
// boolean compressGenotypes = defaultGenotype != null && !defaultGenotype.isEmpty();
boolean doMerge = options.getBoolean(MERGE.key(), false);
boolean doStage = options.getBoolean(STAGE.key(), false);
final int fileId = options.getInt(Options.FILE_ID.key());
logger.info("Loading variants...");
long start = System.currentTimeMillis();
if (doStage) {
stage(inputUri);
}
long skippedVariants = options.getLong("skippedVariants");
if (doMerge) {
MongoDBVariantWriteResult writeResult = merge(Collections.singletonList(fileId), skippedVariants);
}
long end = System.currentTimeMillis();
logger.info("end - start = " + (end - start) / 1000.0 + "s");
logger.info("Variants loaded!");
return inputUri; //TODO: Return something like this: mongo://<host>/<dbName>/<collectionName>
}
public void stage(URI inputUri) throws StorageEngineException {
final int fileId = options.getInt(Options.FILE_ID.key());
if (!options.getBoolean(STAGE.key(), false)) {
// Do not stage!
return;
}
Path input = Paths.get(inputUri.getPath());
VariantSource source = readVariantSource(inputUri, null);
int numRecords = source.getStats().getNumRecords();
int batchSize = options.getInt(Options.LOAD_BATCH_SIZE.key(), Options.LOAD_BATCH_SIZE.defaultValue());
int bulkSize = options.getInt(BULK_SIZE.key(), batchSize);
int loadThreads = options.getInt(Options.LOAD_THREADS.key(), Options.LOAD_THREADS.defaultValue());
final int numReaders = 1;
// final int numTasks = loadThreads == 1 ? 1 : loadThreads - numReaders; //Subtract the reader thread
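// Stage collection: intermediate collection where the file is written before being merged into the variants collection.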
MongoDBCollection stageCollection = dbAdaptor.getStageCollection();
try {
StudyConfiguration studyConfiguration = getStudyConfiguration();
//Reader
VariantReader variantReader;
variantReader = VariantReaderUtils.getVariantReader(input, source);
//Remapping ids task
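// Rewrite each variant so it carries the numeric study id and file id used internally.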
String fileIdStr = options.getString(Options.FILE_ID.key());
ParallelTaskRunner.Task<Variant, Variant> remapIdsTask = batch -> {
batch.forEach(variant -> variant.getStudies()
.forEach(studyEntry -> {
studyEntry.setStudyId(Integer.toString(studyConfiguration.getStudyId()));
studyEntry.getFiles().forEach(fileEntry -> fileEntry.setFileId(fileIdStr));
}));
return batch;
};
//Runner
ProgressLogger progressLogger = new ProgressLogger("Write variants in STAGE collection:", numRecords, 200);
MongoDBVariantStageConverterTask converterTask = new MongoDBVariantStageConverterTask(progressLogger);
MongoDBVariantStageLoader stageLoader =
new MongoDBVariantStageLoader(stageCollection, studyConfiguration.getStudyId(), fileId,
isResumeStage(options));
ParallelTaskRunner<Variant, ?> ptr;
ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder()
.setReadQueuePutTimeout(20 * 60)
.setNumTasks(loadThreads)
.setBatchSize(batchSize)
.setAbortOnFail(true).build();
if (options.getBoolean(STAGE_PARALLEL_WRITE.key(), STAGE_PARALLEL_WRITE.defaultValue())) {
logger.info("Multi thread stage load... [{} readerThreads, {} writerThreads]", numReaders, loadThreads);
ptr = new ParallelTaskRunner<>(variantReader, remapIdsTask.then(converterTask).then(stageLoader), null, config);
} else {
logger.info("Multi thread stage load... [{} readerThreads, {} tasks, {} writerThreads]", numReaders, loadThreads, 1);
ptr = new ParallelTaskRunner<>(variantReader, remapIdsTask.then(converterTask), stageLoader, config);
}
Thread hook = new Thread(() -> {
try {
logger.error("Stage shutdown hook!");
stageError();
} catch (StorageEngineException e) {
logger.error("Error at shutdown", e);
throw new RuntimeException(e);
}
});
try {
Runtime.getRuntime().addShutdownHook(hook);
ptr.run();
stageSuccess(source);
} finally {
Runtime.getRuntime().removeShutdownHook(hook);
}
long skippedVariants = converterTask.getSkippedVariants();
stageLoader.getWriteResult().setSkippedVariants(skippedVariants);
loadStats.append(MERGE.key(), false);
loadStats.append("stageWriteResult", stageLoader.getWriteResult());
options.put("skippedVariants", skippedVariants);
logger.info("Stage Write result: {}", skippedVariants);
} catch (ExecutionException | RuntimeException e) {
try {
stageError();
} catch (Exception e2) {
// Do not propagate this exception!
logger.error("Error reporting stage error!", e2);
}
throw new StorageEngineException("Error while executing STAGE variants", e);
}
}
/**
* Check that this file can be staged.
*
* - The file is not already staged
* - The file is not currently being staged
*
* @param fileId File to stage
* @return The {@link BatchFileOperation} registered for the stage step
* @throws StorageEngineException If the file is being staged by another process, or another operation is in progress
*/
private BatchFileOperation preStage(int fileId) throws StorageEngineException {
StudyConfigurationManager scm = dbAdaptor.getStudyConfigurationManager();
AtomicReference<BatchFileOperation> operation = new AtomicReference<>();
scm.lockAndUpdate(getStudyId(), studyConfiguration -> {
operation.set(securePreStage(fileId, studyConfiguration));
return studyConfiguration;
});
return operation.get();
}
private BatchFileOperation securePreStage(int fileId, StudyConfiguration studyConfiguration) throws StorageEngineException {
String fileName = studyConfiguration.getFileIds().inverse().get(fileId);
Query query = new Query()
.append(VariantSourceDBAdaptor.VariantSourceQueryParam.STUDY_ID.key(), studyConfiguration.getStudyId())
.append(VariantSourceDBAdaptor.VariantSourceQueryParam.FILE_ID.key(), fileId);
Iterator<VariantSource> iterator = dbAdaptor.getVariantSourceDBAdaptor().iterator(query, new QueryOptions());
boolean loadStageResume = false;
boolean stage = true;
BatchFileOperation operation = getBatchFileOperation(studyConfiguration.getBatches(),
op -> op.getOperationName().equals(STAGE.key())
&& op.getFileIds().equals(Collections.singletonList(fileId)));
if (iterator.hasNext()) {
// Already indexed!
logger.info("File \"{}\" ({}) already staged!", fileName, fileId);
stage = false;
if (operation != null && !operation.currentStatus().equals(BatchFileOperation.Status.READY)) {
// There was an error writing the operation status. Restore to "READY"
operation.addStatus(BatchFileOperation.Status.READY);
}
} else {
loadStageResume = isResumeStage(options);
if (operation != null) {
switch (operation.currentStatus()) {
case READY:
// Already indexed!
// TODO: Believe this ready? What if deleted?
logger.info("File \"{}\" ({}) already staged!", fileName, fileId);
stage = false;
//dbAdaptor.getVariantSourceDBAdaptor().updateVariantSource(source);
break;
case RUNNING:
if (!loadStageResume) {
throw MongoVariantStorageEngineException.fileBeingStagedException(fileId, fileName);
}
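// Fall-through: when resuming, a RUNNING stage is handled like ERROR and the stage is re-run.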
case ERROR:
// Resume stage
loadStageResume = true;
options.put(STAGE_RESUME.key(), true);
break;
default:
throw new IllegalStateException("Unknown status: " + operation.currentStatus());
}
} else {
operation = new BatchFileOperation(STAGE.key(), Collections.singletonList(fileId), System.currentTimeMillis(),
BatchFileOperation.Type.OTHER);
studyConfiguration.getBatches().add(operation);
}
if (stage) {
operation.addStatus(Calendar.getInstance().getTime(), BatchFileOperation.Status.RUNNING);
}
}
if (stage) {
BatchFileOperation mergeOperation = getBatchFileOperation(studyConfiguration.getBatches(),
op -> op.getOperationName().equals(MERGE.key()) && !op.currentStatus().equals(BatchFileOperation.Status.READY));
if (mergeOperation != null) {
// Avoid staging new files if there are ongoing merge operations
throw MongoVariantStorageEngineException.operationInProgressException(mergeOperation);
}
}
options.put(STAGE.key(), stage);
return operation;
}
private BatchFileOperation getBatchFileOperation(List<BatchFileOperation> batches, Predicate<BatchFileOperation> filter) {
for (int i = batches.size() - 1; i >= 0; i--) {
BatchFileOperation op = batches.get(i);
if (filter.test(op)) {
return op;
}
}
return null;
}
public void stageError() throws StorageEngineException {
int fileId = options.getInt(Options.FILE_ID.key());
setStatus(BatchFileOperation.Status.ERROR, STAGE.key(), Collections.singletonList(fileId));
}
public void stageSuccess(VariantSource source) throws StorageEngineException {
// Stage loading finished. Save VariantSource and update BatchOperation
source.setFileId(options.getString(Options.FILE_ID.key()));
source.setStudyId(options.getString(Options.STUDY_ID.key()));
setStatus(BatchFileOperation.Status.READY, STAGE.key(), Collections.singletonList(options.getInt(Options.FILE_ID.key())));
dbAdaptor.getVariantSourceDBAdaptor().updateVariantSource(source);
}
/**
* Merge staged files into Variant collection.
*
* @param fileIds FileIDs of the files to be merged
* @return Write Result with times and count
* @throws StorageEngineException If there is a problem executing the {@link ParallelTaskRunner}
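*
* <p>A minimal usage sketch (assuming {@code pipeline} is this MongoDBVariantStoragePipeline and the file,
* identified by its numeric {@code fileId}, has already been staged):
* <pre>{@code
* MongoDBVariantWriteResult result = pipeline.merge(Collections.singletonList(fileId));
* }</pre>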
*/
public MongoDBVariantWriteResult merge(List<Integer> fileIds) throws StorageEngineException {
return merge(fileIds, options.getInt("skippedVariants", 0));
}
/**
* Merge staged files into Variant collection.
*
* 1- Find out whether the files to merge are split by chromosome.
* 2- If they are, call once per chromosome. Otherwise, call only once.
*
* @see MongoDBVariantMerger
*
* @param fileIds FileIDs of the files to be merged
* @param skippedVariants Number of variants skipped during the stage step
* @return Write Result with times and count
* @throws StorageEngineException If there is a problem executing the {@link ParallelTaskRunner}
*/
protected MongoDBVariantWriteResult merge(List<Integer> fileIds, long skippedVariants)
throws StorageEngineException {
long start = System.currentTimeMillis();
options.put(Options.FILE_ID.key(), fileIds);
StudyConfiguration studyConfiguration = preMerge(fileIds);
//Stage collection where files are loaded.
MongoDBCollection stageCollection = dbAdaptor.getStageCollection();
int batchSize = options.getInt(Options.LOAD_BATCH_SIZE.key(), Options.LOAD_BATCH_SIZE.defaultValue());
int loadThreads = options.getInt(Options.LOAD_THREADS.key(), Options.LOAD_THREADS.defaultValue());
int capacity = options.getInt("blockingQueueCapacity", loadThreads * 2);
//Iterate over all the files
Query query = new Query(VariantSourceDBAdaptor.VariantSourceQueryParam.STUDY_ID.key(), studyConfiguration.getStudyId());
Iterator<VariantSource> iterator = dbAdaptor.getVariantSourceDBAdaptor().iterator(query, null);
// List of chromosomes to be loaded
Set<String> chromosomesToLoad = new HashSet<>();
// List of all the indexed files that cover each chromosome
ListMultimap<String, Integer> chromosomeInLoadedFiles = LinkedListMultimap.create();
// List of all the files to be loaded that cover each chromosome
ListMultimap<String, Integer> chromosomeInFilesToLoad = LinkedListMultimap.create();
Set<String> wholeGenomeFiles = new HashSet<>();
Set<String> byChromosomeFiles = new HashSet<>();
while (iterator.hasNext()) {
VariantSource variantSource = iterator.next();
int fileId = Integer.parseInt(variantSource.getFileId());
// If the file is going to be loaded, check whether it covers just one chromosome
if (fileIds.contains(fileId)) {
if (variantSource.getStats().getChromosomeCounts().size() == 1) {
chromosomesToLoad.addAll(variantSource.getStats().getChromosomeCounts().keySet());
byChromosomeFiles.add(variantSource.getFileName());
} else {
wholeGenomeFiles.add(variantSource.getFileName());
}
}
// If the file is indexed, add to the map of chromosome->fileId
for (String chromosome : variantSource.getStats().getChromosomeCounts().keySet()) {
if (studyConfiguration.getIndexedFiles().contains(fileId)) {
chromosomeInLoadedFiles.put(chromosome, fileId);
} else if (fileIds.contains(fileId)) {
chromosomeInFilesToLoad.put(chromosome, fileId);
} // else { ignore files that are not loaded, and are not going to be loaded }
}
}
final MongoDBVariantWriteResult writeResult;
if (options.getBoolean(MERGE_SKIP.key())) {
// It was already merged, but still some work is needed. Exit to do postLoad step
writeResult = new MongoDBVariantWriteResult();
} else {
Thread hook = new Thread(() -> {
try {
logger.error("Merge shutdown hook!");
setStatus(BatchFileOperation.Status.ERROR, MERGE.key(), fileIds);
} catch (StorageEngineException e) {
logger.error("Failed setting status '" + MERGE.key() + "' operation over files " + fileIds
+ " to '" + BatchFileOperation.Status.ERROR + '\'', e);
throw new RuntimeException(e);
}
});
Runtime.getRuntime().addShutdownHook(hook);
try {
if (!wholeGenomeFiles.isEmpty() && !byChromosomeFiles.isEmpty()) {
String message = "Impossible to merge files splitted and not splitted by chromosome at the same time! "
+ "Files covering only one chromosome: " + byChromosomeFiles + ". "
+ "Files covering more than one chromosome: " + wholeGenomeFiles;
logger.error(message);
throw new StorageEngineException(message);
}
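// If none of the files to load is restricted to a single chromosome, merge everything in one pass; otherwise, merge chromosome by chromosome.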
if (chromosomesToLoad.isEmpty()) {
writeResult = mergeByChromosome(fileIds, batchSize, loadThreads, stageCollection,
studyConfiguration, null, studyConfiguration.getIndexedFiles());
} else {
writeResult = new MongoDBVariantWriteResult();
for (String chromosome : chromosomesToLoad) {
List<Integer> filesToLoad = chromosomeInFilesToLoad.get(chromosome);
Set<Integer> indexedFiles = new HashSet<>(chromosomeInLoadedFiles.get(chromosome));
MongoDBVariantWriteResult aux = mergeByChromosome(filesToLoad, batchSize, loadThreads, stageCollection,
studyConfiguration, chromosome, indexedFiles);
writeResult.merge(aux);
}
}
} catch (Exception e) {
setStatus(BatchFileOperation.Status.ERROR, MERGE.key(), fileIds);
throw e;
} finally {
Runtime.getRuntime().removeShutdownHook(hook);
}
setStatus(BatchFileOperation.Status.DONE, MERGE.key(), fileIds);
}
if (!options.getBoolean(STAGE_CLEAN_WHILE_LOAD.key(), STAGE_CLEAN_WHILE_LOAD.defaultValue())) {
StopWatch time = StopWatch.createStarted();
logger.info("Deleting variant records from Stage collection");
long modifiedCount = MongoDBVariantStageLoader.cleanStageCollection(stageCollection, studyConfiguration.getStudyId(), fileIds,
chromosomesToLoad, writeResult);
logger.info("Delete variants time: " + time.getTime(TimeUnit.MILLISECONDS) / 1000.0 + "s , CleanDocuments: " + modifiedCount);
}
writeResult.setSkippedVariants(skippedVariants);
logger.info("Write result: {}", writeResult.toString());
// logger.info("Write result: {}", writeResult.toTSV());
logger.info("Write result: {}", writeResult.toJson());
options.put("writeResult", writeResult);
loadStats.append(MERGE.key(), true);
loadStats.append("mergeWriteResult", writeResult);
long end = System.currentTimeMillis();
logger.info("end - start = " + (end - start) / 1000.0 + "s");
logger.info("Variants merged!");
return writeResult;
}
private StudyConfiguration preMerge(List<Integer> fileIds) throws StorageEngineException {
int studyId = getStudyId();
Set<Integer> fileIdsSet = new HashSet<>(fileIds);
return dbAdaptor.getStudyConfigurationManager().lockAndUpdate(studyId, studyConfiguration -> {
for (Integer fileId : fileIds) {
if (studyConfiguration.getIndexedFiles().contains(fileId)) {
throw StorageEngineException.alreadyLoaded(fileId, studyConfiguration);
}
}
boolean loadMergeResume = isResumeMerge(options);
List<BatchFileOperation> batches = studyConfiguration.getBatches();
BatchFileOperation operation = null;
for (int i = batches.size() - 1; i >= 0; i--) {
BatchFileOperation op = batches.get(i);
if (op.getOperationName().equals(MERGE.key())
&& fileIds.size() == op.getFileIds().size()
&& fileIdsSet.containsAll(op.getFileIds())) {
switch (op.currentStatus()) {
case READY:// Already indexed!
// TODO: Believe this ready? What if deleted?
// It was not "indexed" so suppose "deleted" ?
break;
case DONE:
// Already merged but still needs some work.
logger.info("Files " + fileIds + " where already merged, but where not marked as indexed files.");
options.put(MERGE_SKIP.key(), true);
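// Fall-through into RUNNING: without the resume option, the load is rejected as an in-progress merge.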
case RUNNING:
if (!loadMergeResume) {
throw MongoVariantStorageEngineException.filesBeingMergedException(fileIds);
}
break;
case ERROR:
// Resume merge
loadMergeResume = true;
options.put(MERGE_RESUME.key(), loadMergeResume);
break;
default:
throw new IllegalStateException("Unknown status: " + op.currentStatus());
}
operation = op;
break;
} else {
// Can not merge any file if there is an ongoing MERGE or STAGE operation
if (op.getOperationName().equals(MERGE.key()) || op.getOperationName().equals(STAGE.key())) {
if (!op.currentStatus().equals(BatchFileOperation.Status.READY)) {
throw MongoVariantStorageEngineException.operationInProgressException(op);
}
}
}
}
if (operation == null) {
operation = new BatchFileOperation(MERGE.key(), fileIds, System.currentTimeMillis(), BatchFileOperation.Type.LOAD);
studyConfiguration.getBatches().add(operation);
operation.addStatus(Calendar.getInstance().getTime(), BatchFileOperation.Status.RUNNING);
} else if (operation.currentStatus() == BatchFileOperation.Status.ERROR) {
// Only set to RUNNING if it was on ERROR
operation.addStatus(Calendar.getInstance().getTime(), BatchFileOperation.Status.RUNNING);
}
return studyConfiguration;
});
}
private MongoDBVariantWriteResult mergeByChromosome(
List<Integer> fileIds, int batchSize, int loadThreads, MongoDBCollection stageCollection,
StudyConfiguration studyConfiguration, String chromosomeToLoad, Set<Integer> indexedFiles)
throws StorageEngineException {
MongoDBVariantStageReader reader = new MongoDBVariantStageReader(stageCollection, studyConfiguration.getStudyId(),
chromosomeToLoad == null ? Collections.emptyList() : Collections.singletonList(chromosomeToLoad));
boolean resume = isResumeMerge(options);
boolean cleanWhileLoading = options.getBoolean(STAGE_CLEAN_WHILE_LOAD.key(), STAGE_CLEAN_WHILE_LOAD.defaultValue());
ProgressLogger progressLogger = new ProgressLogger("Write variants in VARIANTS collection:", reader::countNumVariants, 200);
progressLogger.setApproximateTotalCount(reader.countAproxNumVariants());
boolean ignoreOverlapping = options.getBoolean(MERGE_IGNORE_OVERLAPPING_VARIANTS.key(),
MERGE_IGNORE_OVERLAPPING_VARIANTS.defaultValue());
MongoDBVariantMerger variantMerger = new MongoDBVariantMerger(dbAdaptor, studyConfiguration, fileIds, indexedFiles, resume,
ignoreOverlapping);
MongoDBVariantMergeLoader variantLoader = new MongoDBVariantMergeLoader(dbAdaptor.getVariantsCollection(), stageCollection,
studyConfiguration.getStudyId(), fileIds, resume, cleanWhileLoading, progressLogger);
ParallelTaskRunner<Document, MongoDBOperations> ptrMerge;
ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder()
.setReadQueuePutTimeout(20 * 60)
.setNumTasks(loadThreads)
.setBatchSize(batchSize)
.setAbortOnFail(true).build();
try {
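// With parallel write, the loader runs inside each merger task; otherwise it runs as the dedicated writer of the ParallelTaskRunner.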
if (options.getBoolean(MERGE_PARALLEL_WRITE.key(), MERGE_PARALLEL_WRITE.defaultValue())) {
ptrMerge = new ParallelTaskRunner<>(reader, variantMerger.then(variantLoader), null, config);
} else {
ptrMerge = new ParallelTaskRunner<>(reader, variantMerger, variantLoader, config);
}
} catch (RuntimeException e) {
throw new StorageEngineException("Error while creating ParallelTaskRunner", e);
}
try {
if (chromosomeToLoad != null) {
logger.info("Merging files {} in chromosome: {}. Other indexed files in chromosome {}: {}",
fileIds, chromosomeToLoad, chromosomeToLoad, indexedFiles);
} else {
logger.info("Merging files " + fileIds);
}
ptrMerge.run();
} catch (ExecutionException e) {
logger.info("Write result: {}", variantLoader.getResult());
throw new StorageEngineException("Error while executing LoadVariants in ParallelTaskRunner", e);
}
return variantLoader.getResult();
}
@Override
public URI postLoad(URI input, URI output) throws StorageEngineException {
if (options.getBoolean(MERGE.key())) {
return super.postLoad(input, output);
} else {
return input;
}
}
@Override
public void securePostLoad(List<Integer> fileIds, StudyConfiguration studyConfiguration) throws StorageEngineException {
super.securePostLoad(fileIds, studyConfiguration);
BatchFileOperation.Status status = secureSetStatus(studyConfiguration, BatchFileOperation.Status.READY, MERGE.key(), fileIds);
if (status != BatchFileOperation.Status.DONE) {
logger.warn("Unexpected status " + status);
}
}
@Override
public ObjectMap getLoadStats() {
return loadStats;
}
@Override
protected void checkLoadedVariants(URI input, List<Integer> fileIds, StudyConfiguration studyConfiguration)
throws StorageEngineException {
if (fileIds.size() == 1) {
checkLoadedVariants(input, fileIds.get(0), studyConfiguration);
} else {
// FIXME: Check variants in this situation!
logger.warn("Skip check loaded variants");
}
}
@Override
protected void checkLoadedVariants(URI input, int fileId, StudyConfiguration studyConfiguration) throws
StorageEngineException {
VariantSource variantSource = VariantReaderUtils.readVariantSource(Paths.get(input.getPath()), null);
// VariantMongoDBAdaptor dbAdaptor = getDBAdaptor(options.getString(VariantStorageEngine.Options.DB_NAME.key()));
Long count = dbAdaptor.count(new Query()
.append(VariantDBAdaptor.VariantQueryParams.FILES.key(), fileId)
.append(VariantDBAdaptor.VariantQueryParams.STUDIES.key(), studyConfiguration.getStudyId())).first();
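// Variants where this file appears with a negative fileId are counted as overlapped.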
Long overlappedCount = dbAdaptor.count(new Query()
.append(VariantDBAdaptor.VariantQueryParams.FILES.key(), -fileId)
.append(VariantDBAdaptor.VariantQueryParams.STUDIES.key(), studyConfiguration.getStudyId())).first();
long variantsToLoad = 0;
long expectedSkippedVariants = 0;
int symbolicVariants = 0;
int nonVariants = 0;
long alreadyLoadedVariants = options.getLong(ALREADY_LOADED_VARIANTS.key(), 0L);
for (Map.Entry<String, Integer> entry : variantSource.getStats().getVariantTypeCounts().entrySet()) {
if (entry.getKey().equals(VariantType.SYMBOLIC.toString())) {
expectedSkippedVariants += entry.getValue();
symbolicVariants = entry.getValue();
} else if (entry.getKey().equals(VariantType.NO_VARIATION.toString())) {
expectedSkippedVariants += entry.getValue();
nonVariants = entry.getValue();
} else {
variantsToLoad += entry.getValue();
}
}
MongoDBVariantWriteResult writeResult = options.get("writeResult", MongoDBVariantWriteResult.class);
long expectedCount = variantsToLoad;
if (alreadyLoadedVariants != 0) {
writeResult.setNonInsertedVariants(writeResult.getNonInsertedVariants() - alreadyLoadedVariants);
}
if (writeResult.getNonInsertedVariants() != 0) {
expectedCount -= writeResult.getNonInsertedVariants();
}
if (writeResult.getOverlappedVariants() != 0) {
// Expect to find this file in all the overlapped variants
expectedCount += writeResult.getOverlappedVariants();
}
logger.info("============================================================");
logger.info("Check loaded file '" + variantSource.getFileName() + "' (" + fileId + ')');
if (expectedSkippedVariants != writeResult.getSkippedVariants()) {
logger.error("Wrong number of skipped variants. Expected " + expectedSkippedVariants + " and got " + writeResult
.getSkippedVariants());
} else if (writeResult.getSkippedVariants() > 0) {
logger.warn("There were " + writeResult.getSkippedVariants() + " skipped variants.");
if (symbolicVariants > 0) {
logger.info(" * Of which " + symbolicVariants + " are " + VariantType.SYMBOLIC.toString() + " variants.");
}
if (nonVariants > 0) {
logger.info(" * Of which " + nonVariants + " are " + VariantType.NO_VARIATION.toString() + " variants.");
}
}
if (writeResult.getNonInsertedVariants() != 0) {
logger.error("There were " + writeResult.getNonInsertedVariants() + " duplicated variants not inserted. ");
}
if (alreadyLoadedVariants != 0) {
logger.info("Resume mode. Previously loaded variants: " + alreadyLoadedVariants);
}
StorageEngineException exception = null;
if (expectedCount != (count + overlappedCount)) {
String message = "Wrong number of loaded variants. Expected: " + expectedCount + " and got: " + (count + overlappedCount)
+ " (" + count + " from file, " + overlappedCount + " overlapped)";
logger.error(message);
logger.error(" * Variants to load : " + variantsToLoad);
logger.error(" * Non Inserted (due to duplications) : " + writeResult.getNonInsertedVariants());
logger.error(" * Overlapped variants (extra insertions) : " + writeResult.getOverlappedVariants());
// exception = new StorageEngineException(message);
} else {
logger.info("Final number of loaded variants: " + count
+ (overlappedCount > 0 ? " + " + overlappedCount + " overlapped variants" : ""));
}
logger.info("============================================================");
if (exception != null) {
throw exception;
}
}
/* --------------------------------------- */
/* StudyConfiguration utils methods */
/* --------------------------------------- */
/**
* Check if the samples from the selected file can be loaded.
* <p>
* The MongoDB storage plugin is not able to load batches of samples in an unordered way.
* A batch of samples is a group of samples of any size. It may be composed of one or several VCF files, depending
* on whether it is split by region (horizontally) or not.
* All the files from the same batch must be loaded before loading the next batch. Once a new batch of
* samples begins to be loaded, it won't be possible to load other files from previous batches.
* <p>
* The StudyConfiguration must be complete, with all the indexed files and the samples in each file.
* The provided StudyConfiguration won't be modified.
* Requirements:
* - All the samples in the file must be either all loaded or all not loaded
* - If all the samples are already loaded, they must match (same samples, same order) the last loaded file.
*
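* <p>A minimal usage sketch (names are illustrative; {@code studyConfiguration} and {@code fileId} must belong
* to the study being loaded):
* <pre>{@code
* boolean newBatch = MongoDBVariantStoragePipeline.checkCanLoadSampleBatch(studyConfiguration, fileId);
* }</pre>
*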
* @param studyConfiguration StudyConfiguration from the selected study
* @param fileId File to load
* @return Whether this file represents a new batch of samples
* @throws StorageEngineException If any of the requirements is not met
*/
public static boolean checkCanLoadSampleBatch(final StudyConfiguration studyConfiguration, int fileId) throws StorageEngineException {
LinkedHashSet<Integer> sampleIds = studyConfiguration.getSamplesInFiles().get(fileId);
if (!sampleIds.isEmpty()) {
boolean allSamplesRepeated = true;
boolean someSamplesRepeated = false;
BiMap<String, Integer> indexedSamples = StudyConfiguration.getIndexedSamples(studyConfiguration);
for (Integer sampleId : sampleIds) {
if (!indexedSamples.containsValue(sampleId)) {
allSamplesRepeated = false;
} else {
someSamplesRepeated = true;
}
}
if (allSamplesRepeated) {
ArrayList<Integer> indexedFiles = new ArrayList<>(studyConfiguration.getIndexedFiles());
if (!indexedFiles.isEmpty()) {
int lastIndexedFile = indexedFiles.get(indexedFiles.size() - 1);
//Check that they are the same samples in the same order
if (!new ArrayList<>(studyConfiguration.getSamplesInFiles().get(lastIndexedFile)).equals(new ArrayList<>(sampleIds))) {
//ERROR
if (studyConfiguration.getSamplesInFiles().get(lastIndexedFile).containsAll(sampleIds)) {
throw new StorageEngineException("Unable to load this batch. Wrong samples order"); //TODO: Should it care?
} else {
throw new StorageEngineException("Unable to load this batch. Another sample batch has been loaded already.");
}
}
//Ok, the batch of samples matches with the last loaded batch of samples.
return false; // This is NOT a new batch of samples
}
} else if (someSamplesRepeated) {
throw new StorageEngineException("There was some already indexed samples, but not all of them. "
+ "Unable to load in Storage-MongoDB");
}
}
return true; // This is a new batch of samples
}
// @Override
// public void checkStudyConfiguration(StudyConfiguration studyConfiguration, VariantDBAdaptor dbAdaptor) throws
// StorageEngineException {
// super.checkStudyConfiguration(studyConfiguration, dbAdaptor);
// if (dbAdaptor == null) {
// logger.debug("Do not check StudyConfiguration against the loaded in MongoDB");
// } else {
// if (dbAdaptor instanceof VariantMongoDBAdaptor) {
// VariantMongoDBAdaptor mongoDBAdaptor = (VariantMongoDBAdaptor) dbAdaptor;
// StudyConfigurationManager studyConfigurationDBAdaptor = mongoDBAdaptor.getStudyConfigurationManager();
// StudyConfiguration studyConfigurationFromMongo = studyConfigurationDBAdaptor.getStudyConfiguration(studyConfiguration
// .getStudyId(), null).first();
//
// //Check that the provided StudyConfiguration has the same or more information that the stored in MongoDB.
// for (Map.Entry<String, Integer> entry : studyConfigurationFromMongo.getFileIds().entrySet()) {
// if (!studyConfiguration.getFileIds().containsKey(entry.getKey())) {
// throw new StorageEngineException("StudyConfiguration do not have the file " + entry.getKey());
// }
// if (!studyConfiguration.getFileIds().get(entry.getKey()).equals(entry.getValue())) {
// throw new StorageEngineException("StudyConfiguration changes the fileId of '" + entry.getKey() + "' from " +
// entry.getValue() + " to " + studyConfiguration.getFileIds().get(entry.getKey()));
// }
// }
// for (Map.Entry<String, Integer> entry : studyConfigurationFromMongo.getCohortIds().entrySet()) {
// if (!studyConfiguration.getCohortIds().containsKey(entry.getKey())) {
// throw new StorageEngineException("StudyConfiguration do not have the cohort " + entry.getKey());
// }
// if (!studyConfiguration.getCohortIds().get(entry.getKey()).equals(entry.getValue())) {
// throw new StorageEngineException("StudyConfiguration changes the cohortId of '" + entry.getKey() + "' from " +
// entry.getValue() + " to " + studyConfiguration.getCohortIds().get(entry.getKey()));
// }
// }
// for (Map.Entry<String, Integer> entry : studyConfigurationFromMongo.getSampleIds().entrySet()) {
// if (!studyConfiguration.getSampleIds().containsKey(entry.getKey())) {
// throw new StorageEngineException("StudyConfiguration do not have the sample " + entry.getKey());
// }
// if (!studyConfiguration.getSampleIds().get(entry.getKey()).equals(entry.getValue())) {
// throw new StorageEngineException("StudyConfiguration changes the sampleId of '" + entry.getKey() + "' from " +
// entry.getValue() + " to " + studyConfiguration.getSampleIds().get(entry.getKey()));
// }
// }
// studyConfigurationDBAdaptor.updateStudyConfiguration(studyConfiguration, null);
// } else {
// throw new StorageEngineException("Unknown VariantDBAdaptor '" + dbAdaptor.getClass().toString() + "'. Expected '" +
// VariantMongoDBAdaptor.class + "'");
// }
// }
// }
}