/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.opencga.storage.mongodb.variant; import org.apache.log4j.ConsoleAppender; import org.apache.log4j.Level; import org.apache.log4j.LogManager; import org.bson.Document; import org.junit.After; import org.junit.Before; import org.junit.Test; import org.opencb.biodata.formats.variant.io.VariantReader; import org.opencb.biodata.formats.variant.vcf4.io.VariantVcfReader; import org.opencb.biodata.models.variant.StudyEntry; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.VariantSource; import org.opencb.commons.containers.list.SortedList; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.datastore.mongodb.MongoDBCollection; import org.opencb.commons.io.DataWriter; import org.opencb.commons.run.Runner; import org.opencb.commons.run.Task; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.metadata.StudyConfiguration; import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest; import org.opencb.opencga.storage.core.variant.VariantStorageEngine; import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor; import org.opencb.opencga.storage.mongodb.variant.adaptors.VariantMongoDBAdaptor; import org.opencb.opencga.storage.mongodb.variant.adaptors.VariantMongoDBWriter; import org.opencb.opencga.storage.mongodb.variant.load.*; import org.opencb.opencga.storage.mongodb.variant.load.stage.MongoDBVariantStageConverterTask; import org.opencb.opencga.storage.mongodb.variant.load.stage.MongoDBVariantStageLoader; import org.opencb.opencga.storage.mongodb.variant.load.stage.MongoDBVariantStageReader; import org.opencb.opencga.storage.mongodb.variant.load.variants.MongoDBOperations; import org.opencb.opencga.storage.mongodb.variant.load.variants.MongoDBVariantMergeLoader; import org.opencb.opencga.storage.mongodb.variant.load.variants.MongoDBVariantMerger; import java.io.IOException; import java.util.*; import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Function; import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; import static org.opencb.opencga.storage.mongodb.variant.converters.DocumentToSamplesConverter.UNKNOWN_FIELD; import static org.opencb.opencga.storage.mongodb.variant.converters.DocumentToSamplesConverter.UNKNOWN_GENOTYPE; /** * @author Jacobo Coll <jacobo167@gmail.com> */ public class VariantMongoDBWriterTest implements MongoDBVariantStorageTest { private static String inputFile; private static MongoDBVariantStorageEngine variantStorageManager; private VariantSource source1, source2, source3; private StudyConfiguration studyConfiguration, studyConfiguration2; private final Integer fileId1 = 10000; private final Integer fileId2 = 20000; private final Integer fileId3 = 30000; private Integer studyId1 = 1; private Integer studyId2 = 2; private String studyName1 = "Study 1"; private String studyName2 = "Study 2"; private VariantMongoDBAdaptor dbAdaptor; private LinkedHashSet<Integer> file1SampleIds; private LinkedHashSet<Integer> file2SampleIds; private LinkedHashSet<Integer> file3SampleIds; @Before public void setUp() throws Exception { ConsoleAppender stderr = (ConsoleAppender) LogManager.getRootLogger().getAppender("stderr"); stderr.setThreshold(Level.toLevel("debug")); inputFile = VariantStorageBaseTest.getResourceUri("variant-test-file.vcf.gz").getPath(); clearDB(VariantStorageBaseTest.DB_NAME); variantStorageManager = getVariantStorageEngine(); source1 = new VariantSource(getFileName(fileId1), fileId1.toString(), studyId1.toString(), studyName1); studyConfiguration = new StudyConfiguration(studyId1, studyName1); studyConfiguration.getSampleIds().put("NA19600", 1); studyConfiguration.getSampleIds().put("NA19660", 2); studyConfiguration.getSampleIds().put("NA19661", 3); studyConfiguration.getSampleIds().put("NA19685", 4); file1SampleIds = new LinkedHashSet<>(Arrays.asList(1, 2, 3, 4)); studyConfiguration.getFileIds().put(getFileName(fileId1), fileId1); studyConfiguration.getSamplesInFiles().put(fileId1, file1SampleIds); source2 = new VariantSource(getFileName(fileId2), fileId2.toString(), studyId2.toString(), studyName2); studyConfiguration2 = new StudyConfiguration(studyId2, studyName2); studyConfiguration2.getSampleIds().put("NA19600", 1); studyConfiguration2.getSampleIds().put("NA19660", 2); studyConfiguration2.getSampleIds().put("NA19661", 3); studyConfiguration2.getSampleIds().put("NA19685", 4); file2SampleIds = new LinkedHashSet<>(Arrays.asList(1, 2, 3, 4)); studyConfiguration2.getFileIds().put(getFileName(fileId2), fileId2); studyConfiguration2.getSamplesInFiles().put(fileId2, file2SampleIds); source3 = new VariantSource(getFileName(fileId3), fileId3.toString(), studyId2.toString(), studyName2); studyConfiguration2.getSampleIds().put("NA00001.X", 5); studyConfiguration2.getSampleIds().put("NA00002.X", 6); studyConfiguration2.getSampleIds().put("NA00003.X", 7); studyConfiguration2.getSampleIds().put("NA00004.X", 8); file3SampleIds = new LinkedHashSet<>(Arrays.asList(5, 6, 7, 8)); studyConfiguration2.getFileIds().put(source3.getFileName(), fileId3); studyConfiguration2.getSamplesInFiles().put(fileId3, file3SampleIds); dbAdaptor = variantStorageManager.getDBAdaptor(VariantStorageBaseTest.DB_NAME); } @After public void shutdown() throws Exception { } @Test public void test() throws IOException, StorageEngineException { VariantReader reader = new VariantVcfReader(source1, inputFile); List<Task<Variant>> taskList = new SortedList<>(); List<DataWriter<Variant>> writers = new ArrayList<>(); writers.add(new VariantMongoDBWriter(fileId1, studyConfiguration, dbAdaptor, true, false)); // studyConfiguration.getCohorts().put(cohortId, new HashSet<>(Arrays.asList(1, 2, 3, 4))); // studyConfiguration.getCohortIds().put(VariantSourceEntry.DEFAULT_COHORT, cohortId); // for (VariantWriter vw : writers) { // vw.includeStats(true); // } // taskList.add(new VariantStatsTask(reader, study1)); Runner<Variant> vr = new Runner<>(reader, writers, taskList, 200); vr.run(); } /** * Insert some variants. * +-------+---------------+ * | Study1| Study2 | * +-----------|-------+---------------+ * | Variant | File1 | File2 | File3 | * +-----------+-------+-------+-------+ // Check merging having other loaded studies * | 999 | x | | | * | 1000 | x | x | x | * | 1002 | x | | x | * | 1004 | | x | | * | 1006 | | | x | * +-----------+-------+-------+-------+ * */ @Test public void testInsertMultiFiles() throws StorageEngineException { List<Variant> allVariants; studyConfiguration.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key(), Arrays.asList("GQX", "DP")); studyConfiguration.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS_TYPE.key(), Arrays.asList("Float", "Integer")); studyConfiguration2.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key(), Arrays.asList("DP", "GQX")); studyConfiguration2.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS_TYPE.key(), Arrays.asList("Integer", "Float")); assertEquals(new MongoDBVariantWriteResult(3, 0, 0, 0, 0, 0), clearTime(loadFile1())); allVariants = dbAdaptor.get(new Query(), new QueryOptions("sort", true)).getResult(); assertEquals(3, allVariants.size()); assertEquals(new MongoDBVariantWriteResult(1, 1, 0, 0, 0, 0), clearTime(loadFile2())); allVariants = dbAdaptor.get(new Query(), new QueryOptions("sort", true)).getResult(); assertEquals(4, allVariants.size()); assertEquals(new MongoDBVariantWriteResult(1, 2, 1, 0, 0, 0), clearTime(loadFile3())); allVariants = dbAdaptor.get(new Query(), new QueryOptions("sort", true)).getResult(); assertEquals(5, allVariants.size()); checkLoadedVariants(allVariants); } @Test public void testInsertMultiFilesMultiMerge() throws StorageEngineException { List<Variant> allVariants; studyConfiguration.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key(), Arrays.asList("GQX", "DP")); studyConfiguration.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS_TYPE.key(), Arrays.asList("Float", "Integer")); studyConfiguration2.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key(), Arrays.asList("DP", "GQX")); studyConfiguration2.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS_TYPE.key(), Arrays.asList("Integer", "Float")); assertEquals(new MongoDBVariantWriteResult(3, 0, 0, 0, 0, 0), clearTime(loadFile1())); allVariants = dbAdaptor.get(new Query(), new QueryOptions("sort", true)).getResult(); assertEquals(3, allVariants.size()); MongoDBVariantWriteResult writeResult = new MongoDBVariantWriteResult(); writeResult.merge(stageVariants(studyConfiguration2, createFile2Variants(), fileId2)); writeResult.merge(stageVariants(studyConfiguration2, createFile3Variants(), fileId3)); writeResult = mergeVariants(studyConfiguration2, Arrays.asList(fileId2, fileId3), writeResult, Collections.emptyList()); assertEquals(new MongoDBVariantWriteResult(2, 2, 0, 0, 0, 0), clearTime(writeResult)); allVariants = dbAdaptor.get(new Query(), new QueryOptions("sort", true)).getResult(); assertEquals(5, allVariants.size()); checkLoadedVariants(allVariants); } /** * Insert variants chromosome by chromosome * * @throws StorageEngineException */ @Test public void testInsertMultiFilesMultipleRegions() throws StorageEngineException { List<Variant> allVariants; studyConfiguration.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key(), Arrays.asList("GQX", "DP")); studyConfiguration.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS_TYPE.key(), Arrays.asList("Float", "Integer")); studyConfiguration2.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key(), Arrays.asList("DP", "GQX")); studyConfiguration2.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS_TYPE.key(), Arrays.asList("Integer", "Float")); int i = 1; for (String chr : Arrays.asList("1", "2", "3")) { Query query = new Query(VariantDBAdaptor.VariantQueryParams.REGION.key(), chr); assertEquals(new MongoDBVariantWriteResult(3, 0, 0, 0, 0, 0), clearTime(loadFile1(chr, i++, Collections.singletonList(chr)))); allVariants = dbAdaptor.get(query, new QueryOptions("sort", true)).getResult(); assertEquals(3, allVariants.size()); assertEquals(new MongoDBVariantWriteResult(1, 1, 0, 0, 0, 0), clearTime(loadFile2(chr, i++, Collections.singletonList(chr)))); allVariants = dbAdaptor.get(query, new QueryOptions("sort", true)).getResult(); assertEquals(4, allVariants.size()); assertEquals(new MongoDBVariantWriteResult(1, 2, 1, 0, 0, 0), clearTime(loadFile3(chr, i++, Collections.singletonList(chr)))); allVariants = dbAdaptor.get(query, new QueryOptions("sort", true)).getResult(); assertEquals(5, allVariants.size()); checkLoadedVariants(allVariants); } } /** * Insert variants study by study * * @throws StorageEngineException */ @Test public void testInsertMultiFilesMultipleRegionsStudyByStudy() throws StorageEngineException { List<Variant> allVariants; studyConfiguration.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key(), Arrays.asList("GQX", "DP")); studyConfiguration.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS_TYPE.key(), Arrays.asList("Float", "Integer")); studyConfiguration2.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key(), Arrays.asList("DP", "GQX")); studyConfiguration2.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS_TYPE.key(), Arrays.asList("Integer", "Float")); int i = 1; List<String> chromosomes = Arrays.asList("1", "2", "3", "4"); Map<String, int[]> mapFileIds = new HashMap<>(); for (String chr : chromosomes) { mapFileIds.put(chr, new int[]{i++, i++, i++}); Query query = new Query(VariantDBAdaptor.VariantQueryParams.REGION.key(), chr); assertEquals(new MongoDBVariantWriteResult(2, 0, 0, 0, 0, 0), clearTime(loadFile2(chr, mapFileIds.get(chr)[1], Collections.singletonList(chr)))); allVariants = dbAdaptor.get(query, new QueryOptions("sort", true)).getResult(); assertEquals(2, allVariants.size()); assertEquals(new MongoDBVariantWriteResult(2, 1, 1, 0, 0, 0), clearTime(loadFile3(chr, mapFileIds.get(chr)[2], Collections.singletonList(chr)))); allVariants = dbAdaptor.get(query, new QueryOptions("sort", true)).getResult(); assertEquals(4, allVariants.size()); } for (String chr : chromosomes) { Query query = new Query(VariantDBAdaptor.VariantQueryParams.REGION.key(), chr); assertEquals(new MongoDBVariantWriteResult(1, 2, 0, 0, 0, 0), clearTime(loadFile1(chr, mapFileIds.get(chr)[0], Collections.singletonList(chr)))); allVariants = dbAdaptor.get(query, new QueryOptions("sort", true)).getResult(); assertEquals(5, allVariants.size()); } for (String chr : chromosomes) { Query query = new Query(VariantDBAdaptor.VariantQueryParams.REGION.key(), chr); allVariants = dbAdaptor.get(query, new QueryOptions("sort", true)).getResult(); checkLoadedVariants(allVariants, mapFileIds.get(chr)); } } /** * Insert variants study by study * * @throws StorageEngineException */ @Test public void testInsertMultiFilesMultipleRegionsStudyByStudy2() throws StorageEngineException { List<Variant> allVariants; studyConfiguration.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key(), Arrays.asList("GQX", "DP")); studyConfiguration.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS_TYPE.key(), Arrays.asList("Float", "Integer")); studyConfiguration2.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key(), Arrays.asList("DP", "GQX")); studyConfiguration2.getAttributes().put(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS_TYPE.key(), Arrays.asList("Integer", "Float")); int i = 1; List<String> chromosomes = Arrays.asList("1", "2", "X", "3", "5", "4"); // List<String> chromosomes = Arrays.asList("4", "3", "2", "1"); Map<String, int[]> mapFileIds = new HashMap<>(); for (String chr : chromosomes) { mapFileIds.put(chr, new int[]{i++, i++, i++}); Query query = new Query(VariantDBAdaptor.VariantQueryParams.REGION.key(), chr); assertEquals(new MongoDBVariantWriteResult(3, 0, 0, 0, 0, 0), clearTime(loadFile1(chr, mapFileIds.get(chr)[0], Collections.singletonList(chr)))); allVariants = dbAdaptor.get(query, new QueryOptions("sort", true)).getResult(); assertEquals(3, allVariants.size()); } for (String chr : chromosomes) { Query query = new Query(VariantDBAdaptor.VariantQueryParams.REGION.key(), chr); assertEquals(new MongoDBVariantWriteResult(1, 1, 0, 0, 0, 0), clearTime(loadFile2(chr, mapFileIds.get(chr)[1], Collections.singletonList(chr)))); allVariants = dbAdaptor.get(query, new QueryOptions("sort", true)).getResult(); assertEquals(4, allVariants.size()); assertEquals(new MongoDBVariantWriteResult(1, 2, 1, 0, 0, 0), clearTime(loadFile3(chr, mapFileIds.get(chr)[2], Collections.singletonList(chr)))); allVariants = dbAdaptor.get(query, new QueryOptions("sort", true)).getResult(); assertEquals(5, allVariants.size()); } for (String chr : chromosomes) { Query query = new Query(VariantDBAdaptor.VariantQueryParams.REGION.key(), chr); allVariants = dbAdaptor.get(query, new QueryOptions("sort", true)).getResult(); checkLoadedVariants(allVariants, mapFileIds.get(chr)); } } public void checkLoadedVariants(List<Variant> allVariants) { checkLoadedVariants(allVariants, new int[]{fileId1, fileId2, fileId3}); } public void checkLoadedVariants(List<Variant> allVariants, int[] fileIds) { Variant variant; variant = allVariants.get(0); assertEquals(999, variant.getStart().longValue()); assertEquals(Collections.singleton(studyName1), variant.getStudiesMap().keySet()); variant = allVariants.get(1); assertEquals(1000, variant.getStart().longValue()); assertEquals(new HashSet<>(Arrays.asList(studyName1, studyName2)), variant.getStudiesMap().keySet()); checkSampleData(variant, studyConfiguration, fileIds[0], (sampleId) -> Integer.toString(sampleId + 10), "DP"); variant = allVariants.get(2); assertEquals(1002, variant.getStart().longValue()); assertEquals(new HashSet<>(Arrays.asList(studyName1, studyName2)), variant.getStudiesMap().keySet()); checkSampleData(variant, studyConfiguration, fileIds[0], (sampleId) -> Integer.toString(sampleId + 10), "DP"); checkSampleData(variant, studyConfiguration2, fileIds[1], (sampleId) -> UNKNOWN_FIELD, "DP"); checkSampleData(variant, studyConfiguration2, fileIds[1], (sampleId) -> UNKNOWN_GENOTYPE, "GT"); checkSampleData(variant, studyConfiguration2, fileIds[2], Object::toString, "DP"); variant = allVariants.get(3); assertEquals(1004, variant.getStart().longValue()); assertEquals(Collections.singleton(studyName2), variant.getStudiesMap().keySet()); checkSampleData(variant, studyConfiguration2, fileIds[1], Object::toString, "DP"); checkSampleData(variant, studyConfiguration2, fileIds[2], (sampleId) -> UNKNOWN_FIELD, "DP"); checkSampleData(variant, studyConfiguration2, fileIds[2], (sampleId) -> UNKNOWN_GENOTYPE, "GT"); checkSampleData(variant, studyConfiguration2, fileIds[1], (sampleId) -> sampleId % 2 == 0 ? UNKNOWN_FIELD : "0.7", "GQX"); checkSampleData(variant, studyConfiguration2, fileIds[2], (sampleId) -> UNKNOWN_FIELD, "GQX"); variant = allVariants.get(4); assertEquals(1006, variant.getStart().longValue()); assertEquals(Collections.singleton(studyName2), variant.getStudiesMap().keySet()); checkSampleData(variant, studyConfiguration2, fileIds[1], (sampleId) -> UNKNOWN_FIELD, "DP"); checkSampleData(variant, studyConfiguration2, fileIds[1], (sampleId) -> UNKNOWN_FIELD, "GQX"); checkSampleData(variant, studyConfiguration2, fileIds[1], (sampleId) -> UNKNOWN_GENOTYPE, "GT"); checkSampleData(variant, studyConfiguration2, fileIds[2], Object::toString, "DP"); checkSampleData(variant, studyConfiguration2, fileIds[2], (sampleId) -> "0.7", "GQX"); } public void checkSampleData(Variant variant, StudyConfiguration studyConfiguration, Integer fileId, Function<Integer, String> valueProvider, String field) { assertTrue(studyConfiguration.getFileIds().values().contains(fileId)); studyConfiguration.getSamplesInFiles().get(fileId).forEach((sampleId) -> { String sampleName = studyConfiguration.getSampleIds().inverse().get(sampleId); StudyEntry study = variant.getStudy(studyConfiguration.getStudyName()); assertTrue(study.getSamplesName().contains(sampleName)); assertEquals("FileId=" + fileId + " Field=" + field + " Sample=" + sampleName + " (" + sampleId + ")", valueProvider.apply(sampleId), study.getSampleData(sampleName, field)); }); } public MongoDBVariantWriteResult loadFile1() throws StorageEngineException { return loadFile1("X", Integer.parseInt(source1.getFileId()), Collections.emptyList()); } public MongoDBVariantWriteResult loadFile1(String chromosome, Integer fileId, List<String> chromosomes) throws StorageEngineException { studyConfiguration.getFileIds().putIfAbsent(getFileName(fileId), fileId); studyConfiguration.getSamplesInFiles().putIfAbsent(fileId, file1SampleIds); System.out.println("chromosome = " + chromosome); System.out.println("fileId = " + fileId); System.out.println("samples = " + file1SampleIds.stream().map(i -> studyConfiguration.getSampleIds().inverse().get(i)).collect(Collectors.toList()) + " : " + file1SampleIds); return loadFile(studyConfiguration, createFile1Variants(chromosome, fileId.toString(), Integer.toString(studyConfiguration.getStudyId())), fileId, chromosomes); } public MongoDBVariantWriteResult loadFile2() throws StorageEngineException { return loadFile2("X", Integer.parseInt(source2.getFileId()), Collections.emptyList()); } public MongoDBVariantWriteResult loadFile2(String chromosome, Integer fileId, List<String> chromosomes) throws StorageEngineException { studyConfiguration2.getFileIds().putIfAbsent(getFileName(fileId), fileId); studyConfiguration2.getSamplesInFiles().putIfAbsent(fileId, file2SampleIds); System.out.println("chromosome = " + chromosome); System.out.println("fileId = " + fileId); System.out.println("samples = " + file2SampleIds.stream().map(i -> studyConfiguration2.getSampleIds().inverse().get(i)).collect(Collectors.toList()) + " : " + file2SampleIds); return loadFile(studyConfiguration2, createFile2Variants(chromosome, fileId.toString(), source2.getStudyId()), fileId, chromosomes); } public MongoDBVariantWriteResult loadFile3() throws StorageEngineException { return loadFile3("X", Integer.parseInt(source3.getFileId()), Collections.emptyList()); } public MongoDBVariantWriteResult loadFile3(String chromosome, Integer fileId, List<String> chromosomes) throws StorageEngineException { studyConfiguration2.getFileIds().putIfAbsent(getFileName(fileId), fileId); studyConfiguration2.getSamplesInFiles().putIfAbsent(fileId, file3SampleIds); System.out.println("chromosome = " + chromosome); System.out.println("fileId = " + fileId); System.out.println("samples = " + file3SampleIds.stream().map(i -> studyConfiguration2.getSampleIds().inverse().get(i)).collect(Collectors.toList()) + " : " + file3SampleIds); return loadFile(studyConfiguration2, createFile3Variants(chromosome, fileId.toString(), source3.getStudyId()), fileId, chromosomes); } public MongoDBVariantWriteResult loadFile(StudyConfiguration studyConfiguration, List<Variant> variants, int fileId) throws StorageEngineException { return loadFile(studyConfiguration, variants, fileId, Collections.emptyList()); } public MongoDBVariantWriteResult loadFile(StudyConfiguration studyConfiguration, List<Variant> variants, int fileId, List<String> chromosomes) throws StorageEngineException { // return loadFileOld(studyConfiguration, variants, fileId); MongoDBVariantWriteResult stageWriteResult = stageVariants(studyConfiguration, variants, fileId); return mergeVariants(studyConfiguration, Collections.singletonList(fileId), stageWriteResult, chromosomes); } public MongoDBVariantWriteResult loadFileOld(StudyConfiguration studyConfiguration, List<Variant> variants, int fileId) throws StorageEngineException { VariantMongoDBWriter mongoDBWriter; mongoDBWriter = new VariantMongoDBWriter(fileId, studyConfiguration, dbAdaptor, true, false); mongoDBWriter.setThreadSynchronizationBoolean(new AtomicBoolean(false)); mongoDBWriter.open(); mongoDBWriter.pre(); variants.forEach(mongoDBWriter::write); mongoDBWriter.post(); mongoDBWriter.close(); studyConfiguration.getIndexedFiles().add(fileId); return mongoDBWriter.getWriteResult(); } public MongoDBVariantWriteResult stageVariants(StudyConfiguration studyConfiguration, List<Variant> variants, int fileId) { MongoDBCollection stage = dbAdaptor.getStageCollection(); MongoDBVariantStageLoader variantStageLoader = new MongoDBVariantStageLoader(stage, studyConfiguration.getStudyId(), fileId, false); MongoDBVariantStageConverterTask converterTask = new MongoDBVariantStageConverterTask(null); variantStageLoader.write(converterTask.apply(variants)); return variantStageLoader.getWriteResult(); } public MongoDBVariantWriteResult mergeVariants(StudyConfiguration studyConfiguration, int fileId, MongoDBVariantWriteResult stageWriteResult) { return mergeVariants(studyConfiguration, Collections.singletonList(fileId), stageWriteResult, Collections.emptyList()); } public MongoDBVariantWriteResult mergeVariants(StudyConfiguration studyConfiguration, List<Integer> fileIds, MongoDBVariantWriteResult stageWriteResult, List<String> chromosomes) { MongoDBCollection stage = dbAdaptor.getStageCollection(); MongoDBCollection variantsCollection = dbAdaptor.getVariantsCollection(); MongoDBVariantStageReader reader = new MongoDBVariantStageReader(stage, studyConfiguration.getStudyId(), chromosomes); MongoDBVariantMerger dbMerger = new MongoDBVariantMerger(dbAdaptor, studyConfiguration, fileIds, studyConfiguration.getIndexedFiles(), false, false); boolean cleanWhileLoading = true; boolean resume = false; MongoDBVariantMergeLoader variantLoader = new MongoDBVariantMergeLoader(variantsCollection, dbAdaptor.getStageCollection(), studyConfiguration.getStudyId(), fileIds, resume, cleanWhileLoading, null); reader.open(); reader.pre(); List<Document> batch = reader.read(100); while (batch != null && !batch.isEmpty()) { List<MongoDBOperations> apply = dbMerger.apply(batch); variantLoader.write(apply); batch = reader.read(100); } reader.post(); reader.close(); long cleanedDocuments = MongoDBVariantStageLoader.cleanStageCollection(stage, studyConfiguration.getStudyId(), fileIds, null, null); if (cleanWhileLoading) { assertEquals(0, cleanedDocuments); } else { assertNotEquals(0, cleanedDocuments); } studyConfiguration.getIndexedFiles().addAll(fileIds); dbAdaptor.getStudyConfigurationManager().updateStudyConfiguration(studyConfiguration, null); return variantLoader.getResult().setSkippedVariants(stageWriteResult.getSkippedVariants()); } public List<Variant> createFile1Variants() { return createFile1Variants("X", source1.getFileId(), source1.getStudyId()); } public List<Variant> createFile2Variants() { return createFile2Variants("X", source2.getFileId(), source2.getStudyId()); } public List<Variant> createFile3Variants() { return createFile3Variants("X", source3.getFileId(), source3.getStudyId()); } @SuppressWarnings("unchecked") public static List<Variant> createFile1Variants(String chromosome, String fileId, String studyId) { Variant variant; StudyEntry sourceEntry; List<Variant> variants = new LinkedList<>(); variant = new Variant(chromosome, 999, 999, "A", "C"); sourceEntry = new StudyEntry(fileId, studyId); sourceEntry.addSampleData("NA19600", ((Map) new ObjectMap("GT", "./.").append("DP", "11").append("GQX", "0.7"))); sourceEntry.addSampleData("NA19660", ((Map) new ObjectMap("GT", "1/1").append("DP", "12").append("GQX", "0.7"))); sourceEntry.addSampleData("NA19661", ((Map) new ObjectMap("GT", "0/0").append("DP", "13").append("GQX", "0.7"))); sourceEntry.addSampleData("NA19685", ((Map) new ObjectMap("GT", "1/0").append("DP", "14").append("GQX", "0.7"))); variant.addStudyEntry(sourceEntry); variants.add(variant); variant = new Variant(chromosome, 1000, 1000, "A", "C"); sourceEntry = new StudyEntry(fileId, studyId); sourceEntry.addSampleData("NA19600", ((Map) new ObjectMap("GT", "./.").append("DP", "11").append("GQX", "0.7"))); sourceEntry.addSampleData("NA19660", ((Map) new ObjectMap("GT", "1/1").append("DP", "12").append("GQX", "0.7"))); sourceEntry.addSampleData("NA19661", ((Map) new ObjectMap("GT", "0/0").append("DP", "13").append("GQX", "0.7"))); sourceEntry.addSampleData("NA19685", ((Map) new ObjectMap("GT", "1/0").append("DP", "14").append("GQX", "0.7"))); variant.addStudyEntry(sourceEntry); variants.add(variant); variant = new Variant(chromosome, 1002, 1002, "A", "C"); sourceEntry = new StudyEntry(fileId, studyId); sourceEntry.addSampleData("NA19600", ((Map) new ObjectMap("GT", "0/1").append("DP", "11").append("GQX", "0.7"))); sourceEntry.addSampleData("NA19660", ((Map) new ObjectMap("GT", "0/0").append("DP", "12").append("GQX", "0.7"))); sourceEntry.addSampleData("NA19661", ((Map) new ObjectMap("GT", "1/0").append("DP", "13").append("GQX", "0.7"))); sourceEntry.addSampleData("NA19685", ((Map) new ObjectMap("GT", "0/0").append("DP", "14").append("GQX", "0.7"))); variant.addStudyEntry(sourceEntry); variants.add(variant); return variants; } @SuppressWarnings("unchecked") public static List<Variant> createFile2Variants(String chromosome, String fileId, String studyId) { Variant variant; StudyEntry sourceEntry; List<Variant> variants = new LinkedList<>(); variant = new Variant(chromosome, 1000, 1000, "A", "C"); sourceEntry = new StudyEntry(fileId, studyId); sourceEntry.addSampleData("NA19600", ((Map) new ObjectMap("GT", "./.").append("DP", "1").append("GQX", "0.7"))); sourceEntry.addSampleData("NA19660", ((Map) new ObjectMap("GT", "1/1").append("DP", "2").append("GQX", "0.7"))); sourceEntry.addSampleData("NA19661", ((Map) new ObjectMap("GT", "0/0").append("DP", "3").append("GQX", "0.7"))); sourceEntry.addSampleData("NA19685", ((Map) new ObjectMap("GT", "1/0").append("DP", "4").append("GQX", "0.7"))); variant.addStudyEntry(sourceEntry); variants.add(variant); variant = new Variant(chromosome, 1004, 1004, "A", "C"); sourceEntry = new StudyEntry(fileId, studyId); sourceEntry.addSampleData("NA19600", ((Map) new ObjectMap("GT", "0/1").append("DP", "1").append("GQX", "0.7"))); sourceEntry.addSampleData("NA19660", ((Map) new ObjectMap("GT", "0/0").append("DP", "2").append("GQX", "."))); sourceEntry.addSampleData("NA19661", ((Map) new ObjectMap("GT", "1/0").append("DP", "3").append("GQX", "0.7"))); sourceEntry.addSampleData("NA19685", ((Map) new ObjectMap("GT", "0/0").append("DP", "4").append("GQX", ".."))); variant.addStudyEntry(sourceEntry); variants.add(variant); return variants; } @SuppressWarnings("unchecked") public static List<Variant> createFile3Variants(String chromosome, String fileId, String studyId) { Variant variant; StudyEntry sourceEntry; List<Variant> variants = new LinkedList<>(); variant = new Variant(chromosome, 1000, 1000, "A", "C"); sourceEntry = new StudyEntry(fileId, studyId); sourceEntry.addSampleData("NA00001.X", ((Map) new ObjectMap("GT", "0/1").append("DP", "5").append("GQX", "0.7"))); sourceEntry.addSampleData("NA00002.X", ((Map) new ObjectMap("GT", "0/0").append("DP", "6").append("GQX", "0.7"))); sourceEntry.addSampleData("NA00003.X", ((Map) new ObjectMap("GT", "1/0").append("DP", "7").append("GQX", "0.7"))); sourceEntry.addSampleData("NA00004.X", ((Map) new ObjectMap("GT", "0/0").append("DP", "8").append("GQX", "0.7"))); variant.addStudyEntry(sourceEntry); variants.add(variant); variant = new Variant(chromosome, 1002, 1002, "A", "C"); sourceEntry = new StudyEntry(fileId, studyId); sourceEntry.addSampleData("NA00001.X", ((Map) new ObjectMap("GT", "0/1").append("DP", "5").append("GQX", "0.7"))); sourceEntry.addSampleData("NA00002.X", ((Map) new ObjectMap("GT", "0/0").append("DP", "6").append("GQX", "0.7"))); sourceEntry.addSampleData("NA00003.X", ((Map) new ObjectMap("GT", "1/0").append("DP", "7").append("GQX", "0.7"))); sourceEntry.addSampleData("NA00004.X", ((Map) new ObjectMap("GT", "0/0").append("DP", "8").append("GQX", "0.7"))); variant.addStudyEntry(sourceEntry); variants.add(variant); variant = new Variant(chromosome, 1006, 1006, "A", "C"); sourceEntry = new StudyEntry(fileId, studyId); sourceEntry.addSampleData("NA00001.X", ((Map) new ObjectMap("GT", "0/1").append("DP", "5").append("GQX", "0.7"))); sourceEntry.addSampleData("NA00002.X", ((Map) new ObjectMap("GT", "0/0").append("DP", "6").append("GQX", "0.7"))); sourceEntry.addSampleData("NA00003.X", ((Map) new ObjectMap("GT", "1/0").append("DP", "7").append("GQX", "0.7"))); sourceEntry.addSampleData("NA00004.X", ((Map) new ObjectMap("GT", "0/0").append("DP", "8").append("GQX", "0.7"))); variant.addStudyEntry(sourceEntry); variants.add(variant); return variants; } @Test public void testInsertSameVariantTwice() throws StorageEngineException { String chromosome = "1"; loadFile1(chromosome, fileId1, Collections.singletonList(chromosome)); loadFile2(chromosome, fileId2, Collections.singletonList(chromosome)); Integer fileId = fileId3; studyConfiguration2.getFileIds().putIfAbsent(getFileName(fileId), fileId); studyConfiguration2.getSamplesInFiles().putIfAbsent(fileId, file3SampleIds); List<Variant> file3Variants = createFile3Variants(chromosome, fileId.toString(), source3.getStudyId()); file3Variants.add(file3Variants.get(2)); MongoDBVariantWriteResult result = loadFile(studyConfiguration2, file3Variants, fileId, Collections.singletonList(chromosome)); assertEquals(new MongoDBVariantWriteResult(0, 2, 1, 0, 0, 2), clearTime(result)); } public MongoDBVariantWriteResult clearTime(MongoDBVariantWriteResult writeResult) { return writeResult.setExistingVariantsNanoTime(0).setFillGapsNanoTime(0).setNewVariantsNanoTime(0); } public static String getFileName(Integer fileId) { return fileId + "_file.vcf"; } }