/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.core.variant.io;
import org.junit.*;
import org.opencb.biodata.formats.variant.io.VariantReader;
import org.opencb.biodata.formats.variant.vcf4.io.VariantVcfReader;
import org.opencb.biodata.models.core.Region;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantNormalizer;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.models.variant.avro.VariantType;
import org.opencb.biodata.tools.variant.VariantVcfHtsjdkReader;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryResult;
import org.opencb.opencga.storage.core.StoragePipelineResult;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor;
import java.io.*;
import java.net.URI;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
/**
* Created by jmmut on 2015-07-15.
*
* @author Jose Miguel Mut Lopez <jmmut@ebi.ac.uk>
*/
@Ignore
public abstract class VariantVcfExporterTest extends VariantStorageBaseTest {
public static final String[] VCF_TEST_FILE_NAMES = {
"1000g_batches/1-500.filtered.10k.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz",
"1000g_batches/501-1000.filtered.10k.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz",
"1000g_batches/1001-1500.filtered.10k.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz",
"1000g_batches/1501-2000.filtered.10k.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz",
"1000g_batches/2001-2504.filtered.10k.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz",
};
public static final String EXPORTED_FILE_NAME = "exported-variant-test-file.vcf.gz";
private static URI[] inputUri;
private static StoragePipelineResult[] etlResult;
private VariantDBAdaptor dbAdaptor;
protected QueryOptions options;
protected QueryResult<Variant> queryResult;
protected static StudyConfiguration studyConfiguration;
@BeforeClass
public static void beforeClass() throws IOException {
inputUri = new URI[VCF_TEST_FILE_NAMES.length];
etlResult = new StoragePipelineResult[VCF_TEST_FILE_NAMES.length];
for (int i = 0; i < VCF_TEST_FILE_NAMES.length; i++) {
etlResult[i] = null;
inputUri[i] = getResourceUri(VCF_TEST_FILE_NAMES[i]);
}
}
@Override
@Before
public void before() throws Exception {
if (studyConfiguration == null) {
clearDB(DB_NAME);
studyConfiguration = newStudyConfiguration();
}
for (int i = 0; i < VCF_TEST_FILE_NAMES.length; i++) {
if (etlResult[i] == null) {
etlResult[i] = runDefaultETL(inputUri[i], getVariantStorageEngine(), studyConfiguration,
new ObjectMap(VariantStorageEngine.Options.ANNOTATE.key(), false)
.append(VariantStorageEngine.Options.FILE_ID.key(), i)
.append(VariantStorageEngine.Options.CALCULATE_STATS.key(), false));
}
}
dbAdaptor = getVariantStorageEngine().getDBAdaptor(DB_NAME);
}
@After
public void after() throws IOException {
dbAdaptor.close();
}
@Test
public void testVcfHtsExportSingleFile() throws Exception {
Query query = new Query();
LinkedHashSet<Integer> returnedSamplesIds = dbAdaptor.getStudyConfigurationManager().getStudyConfiguration(STUDY_NAME, null)
.first().getSamplesInFiles().get(0);
List<String> returnedSamples = new LinkedList<>();
Map<Integer, String> sampleIdMap = StudyConfiguration.inverseMap(studyConfiguration.getSampleIds());
for (Integer sampleId : returnedSamplesIds) {
returnedSamples.add(sampleIdMap.get(sampleId));
}
query.append(VariantDBAdaptor.VariantQueryParams.STUDIES.key(), STUDY_NAME)
.append(VariantDBAdaptor.VariantQueryParams.RETURNED_FILES.key(), 0)
.append(VariantDBAdaptor.VariantQueryParams.FILES.key(), 0)
.append(VariantDBAdaptor.VariantQueryParams.RETURNED_SAMPLES.key(), returnedSamples);
Path outputVcf = getTmpRootDir().resolve("hts_sf_" + EXPORTED_FILE_NAME);
int failedVariants = VariantVcfDataWriter.htsExport(dbAdaptor.iterator(query, new QueryOptions(QueryOptions.SORT, true)),
studyConfiguration, dbAdaptor.getVariantSourceDBAdaptor()
, new GZIPOutputStream(new FileOutputStream(outputVcf.toFile())), new QueryOptions(VariantDBAdaptor.VariantQueryParams
.RETURNED_SAMPLES.key(), returnedSamples));
assertEquals(0, failedVariants);
// compare VCF_TEST_FILE_NAME and EXPORTED_FILE_NAME
checkExportedVCF(Paths.get(getResourceUri("1000g_batches/1-500.filtered.10k.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf" +
".gz")), outputVcf, new Region("22"));
}
@Test
public void testVcfHtsExportMultiFile() throws Exception {
Query query = new Query();
query.append(VariantDBAdaptor.VariantQueryParams.STUDIES.key(), STUDY_NAME);
// .append(VariantDBAdaptor.VariantQueryParams.REGION.key(), region);
Path outputVcf = getTmpRootDir().resolve("hts_mf_" + EXPORTED_FILE_NAME);
int failedVariants = VariantVcfDataWriter.htsExport(dbAdaptor.iterator(query, new QueryOptions(QueryOptions.SORT, true)), studyConfiguration,
dbAdaptor.getVariantSourceDBAdaptor(),
new GZIPOutputStream(new FileOutputStream(outputVcf.toFile())), null);
assertEquals(0, failedVariants);
// compare VCF_TEST_FILE_NAME and EXPORTED_FILE_NAME
Path originalVcf = Paths.get(getResourceUri("filtered.10k.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz"));
VariantVcfReader variantVcfReader = new VariantVcfReader(new VariantSource(originalVcf.getFileName().toString(), "f", "s", ""),
originalVcf.toString());
variantVcfReader.open();
variantVcfReader.pre();
Region region = new Region("22", 16000000);
int batchSize = 1000;
while (checkExportedVCF(originalVcf, variantVcfReader, outputVcf, region, batchSize) != batchSize) {
region = new Region("22", region.getEnd());
}
variantVcfReader.post();
variantVcfReader.close();
}
@Ignore
@Test
public void testVcfExport() throws Exception {
QueryOptions queryOptions = new QueryOptions();
List<String> include = Arrays.asList("chromosome", "start", "end", "alternative", "reference", "ids", "sourceEntries");
queryOptions.add("include", include);
VariantVcfDataWriter.vcfExport(dbAdaptor, studyConfiguration, new URI(EXPORTED_FILE_NAME), new Query(), queryOptions);
// compare VCF_TEST_FILE_NAME and EXPORTED_FILE_NAME
}
public int checkExportedVCF(Path originalVcf, Path exportedVcf, Region region) throws IOException {
return checkExportedVCF(originalVcf, null, exportedVcf, region, null);
}
/**
* @return number of read variants
*/
public int checkExportedVCF(Path originalVcf, VariantVcfReader originalVcfReader, Path exportedVcf, Region region, Integer lim)
throws IOException {
Map<String, Variant> originalVariants;
if (originalVcfReader == null) {
originalVariants = readVCF(originalVcf, lim, region);
} else {
originalVariants = readVCF(originalVcf, lim, region, originalVcfReader);
}
Map<String, Variant> exportedVariants = readVCF(exportedVcf, region);
// assertEquals(originalVariants.size(), exportedVariants.size());
for (Map.Entry<String, Variant> entry : originalVariants.entrySet()) {
Variant originalVariant = entry.getValue();
Variant exportedVariant = exportedVariants.get(entry.getKey());
assertNotNull("At position " + entry.getValue(), originalVariant);
String message = "At original variant: " + originalVariant + ", and exported variant: " + exportedVariant;
assertNotNull(message, exportedVariant);
assertEquals(message, originalVariant.getChromosome(), exportedVariant.getChromosome());
assertEquals(message, originalVariant.getAlternate(), exportedVariant.getAlternate());
assertEquals(message, originalVariant.getReference(), exportedVariant.getReference());
assertEquals(message, originalVariant.getStart(), exportedVariant.getStart());
assertEquals(message, originalVariant.getEnd(), exportedVariant.getEnd());
assertWithConflicts(originalVariant, () -> assertEquals("At variant " + originalVariant, originalVariant.getIds(), exportedVariant.getIds()));
assertEquals(message, originalVariant.getStudies().size(), exportedVariant.getStudies().size());
assertEquals(message, originalVariant.getSampleNames("f", "s"), exportedVariant.getSampleNames("f",
"s"));
StudyEntry originalSourceEntry = originalVariant.getStudy("s");
StudyEntry exportedSourceEntry = exportedVariant.getStudy("s");
for (String sampleName : originalSourceEntry.getSamplesName()) {
assertWithConflicts(exportedVariant, () -> assertEquals("For sample '" + sampleName + "', id "
+ studyConfiguration.getSampleIds().get(sampleName)
+ ", in " + originalVariant,
originalSourceEntry.getSampleData(sampleName, "GT"),
exportedSourceEntry.getSampleData(sampleName, "GT").replace("0/0", "0|0")));
}
}
return originalVariants.size();
}
public Map<String, Variant> readVCF(Path vcfPath, Region region) throws IOException {
return readVCF(vcfPath, null, region);
}
public Map<String, Variant> readVCF(Path vcfPath, Integer lim, Region region) throws IOException {
if (lim == null) {
lim = Integer.MAX_VALUE;
}
if (region == null) {
region = new Region();
}
InputStream is = new FileInputStream(vcfPath.toFile());
if (vcfPath.toString().endsWith(".gz")) {
is = new GZIPInputStream(is);
}
VariantReader variantVcfReader = new VariantVcfHtsjdkReader(is, new VariantSource(vcfPath.getFileName().toString(), "f", "s", ""), new VariantNormalizer());
variantVcfReader.open();
variantVcfReader.pre();
Map<String, Variant> variantMap;
variantMap = readVCF(vcfPath, lim, region, variantVcfReader);
variantVcfReader.post();
variantVcfReader.close();
return variantMap;
}
public Map<String, Variant> readVCF(Path vcfPath, Integer lim, Region region, VariantReader variantVcfReader) {
Map<String, Variant> variantMap;
variantMap = new LinkedHashMap<>();
List<Variant> read;
int lines = 0;
int batchSize = 100;
int start = Integer.MAX_VALUE;
int end = 0;
int variantsToRead = lines + batchSize > lim ? lim - lines : batchSize;
do {
System.err.println("Reading " + variantsToRead + " variants from '" + vcfPath.getFileName().toString() + "' line : " + lines
+ " variants : " + variantMap.size());
read = variantVcfReader.read(variantsToRead);
for (Variant variant : read) {
lines++;
if (variant.getType().equals(VariantType.SYMBOLIC) || variant.getAlternate().startsWith("<")) {
continue;
}
if (variant.getStart() >= region.getStart() && variant.getEnd() <= region.getEnd()) {
start = Math.min(start, variant.getStart());
end = Math.max(end, variant.getEnd());
variantMap.put(variant.toString(), variant);
if (variantMap.size() == lim) {
break;
}
}
}
} while (!read.isEmpty() && variantMap.size() < lim);
region.setStart(start);
region.setEnd(end);
System.out.println("Read " + variantMap.size() + " variants between " + region.toString());
return variantMap;
}
}