package org.opencb.opencga.storage.mongodb.variant.load;

import com.mongodb.bulk.BulkWriteResult;
import org.bson.Document;
import org.opencb.biodata.formats.variant.io.VariantReader;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.mongodb.MongoDBCollection;
import org.opencb.commons.io.DataWriter;
import org.opencb.commons.run.ParallelTaskRunner;
import org.opencb.opencga.core.common.ProgressLogger;
import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
import org.opencb.opencga.storage.core.metadata.ExportMetadata;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.variant.io.VariantImporter;
import org.opencb.opencga.storage.core.variant.io.avro.VariantAvroReader;
import org.opencb.opencga.storage.mongodb.variant.adaptors.VariantMongoDBAdaptor;
import org.opencb.opencga.storage.mongodb.variant.converters.DocumentToSamplesConverter;
import org.opencb.opencga.storage.mongodb.variant.converters.DocumentToStudyVariantEntryConverter;
import org.opencb.opencga.storage.mongodb.variant.converters.DocumentToVariantConverter;
import org.opencb.opencga.storage.mongodb.variant.converters.DocumentToVariantStatsConverter;

import java.io.IOException;
import java.net.URI;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ExecutionException;
import java.util.stream.Collectors;

/**
 * Created on 07/12/16.
 *
 * @author Jacobo Coll <jacobo167@gmail.com>
 */
public class MongoVariantImporter extends VariantImporter {

    private final MongoDBCollection variantsCollection;

    public MongoVariantImporter(VariantMongoDBAdaptor dbAdaptor) {
        super(dbAdaptor);
        this.variantsCollection = dbAdaptor.getVariantsCollection();
    }

    @Override
    public void importData(URI inputUri, ExportMetadata remappedMetadata,
                           Map<StudyConfiguration, StudyConfiguration> studiesOldNewMap)
            throws StorageEngineException, IOException {
        Path input = Paths.get(inputUri.getPath());

        // Map both the old study name and the old study id to the new study name.
        Map<String, String> studyIdMapping = new HashMap<>(studiesOldNewMap.size() * 2);
        studiesOldNewMap.forEach((old, newer) -> {
            studyIdMapping.put(old.getStudyName(), newer.getStudyName());
            studyIdMapping.put(String.valueOf(old.getStudyId()), newer.getStudyName());
        });

//        VariantReader variantReader = VariantReaderUtils.getVariantReader(input, null);
        //TODO: Read returned samples from Metadata
        Map<String, LinkedHashMap<String, Integer>> samplesPositions = new HashMap<>();
        for (StudyConfiguration sc : remappedMetadata.getStudies()) {
            LinkedHashMap<String, Integer> map = StudyConfiguration.getSortedIndexedSamplesPosition(sc);
//            LinkedHashMap<String, Integer> map = new LinkedHashMap<>();
            samplesPositions.put(sc.getStudyName(), map);
            samplesPositions.put(String.valueOf(sc.getStudyId()), map);
            // Register the same samples position under every old study key that maps to this study.
            studyIdMapping.entrySet().stream()
                    .filter(entry -> entry.getValue().equals(sc.getStudyName()))
                    .forEach(entry -> samplesPositions.put(entry.getKey(), map));
        }
        VariantReader variantReader = new VariantAvroReader(input.toAbsolutePath().toFile(), samplesPositions);

        ProgressLogger progressLogger = new ProgressLogger("Loaded variants");
        ParallelTaskRunner.Task<Variant, Document> converterTask =
                new VariantToDocumentConverter(studiesOldNewMap, progressLogger);

        DataWriter<Document> writer = new MongoDBVariantDocumentDBWriter(variantsCollection);

        ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder()
                .setNumTasks(5)
                .setSorted(false)
                .setBatchSize(200)
                .build();
        ParallelTaskRunner<Variant, Document> ptr = new ParallelTaskRunner<>(variantReader, converterTask, writer, config);

        try {
            ptr.run();
        } catch (ExecutionException e) {
            throw new StorageEngineException("Error importing variants", e);
        }
    }

    /**
     * Simple Task for converting Variants into MongoDB Documents.
     */
    private static class VariantToDocumentConverter implements ParallelTaskRunner.Task<Variant, Document> {

        private final DocumentToVariantConverter variantConverter;
        private final Map<String, String> studiesIdRemap;
        private ProgressLogger progressLogger;

        VariantToDocumentConverter(Map<StudyConfiguration, StudyConfiguration> studiesOldNewMap,
                                   ProgressLogger progressLogger) {
            List<StudyConfiguration> studies = new ArrayList<>(studiesOldNewMap.values());
            DocumentToSamplesConverter samplesConverter = new DocumentToSamplesConverter(studies);
            DocumentToStudyVariantEntryConverter studyConverter =
                    new DocumentToStudyVariantEntryConverter(false, samplesConverter);
            DocumentToVariantStatsConverter statsConverter = new DocumentToVariantStatsConverter(studies);
            variantConverter = new DocumentToVariantConverter(studyConverter, statsConverter);
            // Remap old study names and ids (and the new name itself) to the new numeric study id.
            this.studiesIdRemap = new HashMap<>();
            studiesOldNewMap.forEach((old, newer) -> {
                this.studiesIdRemap.put(old.getStudyName(), String.valueOf(newer.getStudyId()));
                this.studiesIdRemap.put(String.valueOf(old.getStudyId()), String.valueOf(newer.getStudyId()));
                this.studiesIdRemap.put(newer.getStudyName(), String.valueOf(newer.getStudyId()));
            });
            this.progressLogger = progressLogger;
        }

        @Override
        public List<Document> apply(List<Variant> batch) {
            progressLogger.increment(batch.size(), () -> "up to position " + batch.get(batch.size() - 1));
            return batch.stream().map(variant -> {
                for (StudyEntry studyEntry : variant.getStudies()) {
                    studyEntry.setStudyId(studiesIdRemap.getOrDefault(studyEntry.getStudyId(), studyEntry.getStudyId()));
                    for (FileEntry file : studyEntry.getFiles()) {
                        if (file.getFileId().isEmpty()) {
                            file.setFileId("-1");
                        }
                    }
                    if (studyEntry.getSamplesData() == null) {
                        studyEntry.setSamplesData(Collections.emptyList());
                    }
                }
                return variant;
            }).map(variantConverter::convertToStorageType).collect(Collectors.toList());
        }
    }

    /**
     * Simple DataWriter for importing data into MongoDB.
     */
    private static class MongoDBVariantDocumentDBWriter implements DataWriter<Document> {

        private final MongoDBCollection collection;
        private int insertedCount = 0;

        MongoDBVariantDocumentDBWriter(MongoDBCollection collection) {
            this.collection = collection;
        }

        @Override
        public boolean write(List<Document> batch) {
            BulkWriteResult result = collection.insert(batch, QueryOptions.empty()).first();
            insertedCount += result.getInsertedCount();
            return true;
        }

        @Override
        public boolean post() {
            // Build the variant collection indexes once all documents have been inserted.
            VariantMongoDBAdaptor.createIndexes(new QueryOptions(), collection);
            return true;
        }

        public int getInsertedCount() {
            return insertedCount;
        }
    }
}
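
/*
 * Usage sketch: a minimal, hypothetical driver for this importer. How the adaptor, the
 * remapped export metadata, and the old-to-new StudyConfiguration mapping are obtained is
 * outside this class; the variable names and the Avro file path below are illustrative only.
 *
 *     VariantMongoDBAdaptor dbAdaptor = ...;                              // existing adaptor
 *     ExportMetadata exportMetadata = ...;                                // metadata shipped with the export
 *     Map<StudyConfiguration, StudyConfiguration> studiesOldNewMap = ...; // old study -> new study
 *
 *     MongoVariantImporter importer = new MongoVariantImporter(dbAdaptor);
 *     importer.importData(URI.create("file:///path/to/export.avro"), exportMetadata, studiesOldNewMap);
 */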