/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.hadoop.variant;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.PrefixFilter;
import org.junit.Before;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.rules.ExternalResource;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.models.variant.avro.VariantAnnotation;
import org.opencb.biodata.models.variant.avro.VariantType;
import org.opencb.biodata.models.variant.protobuf.VcfMeta;
import org.opencb.biodata.tools.variant.merge.VariantMerger;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.opencga.storage.core.StoragePipelineResult;
import org.opencb.opencga.storage.core.exceptions.StoragePipelineException;
import org.opencb.opencga.storage.core.metadata.BatchFileOperation;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.metadata.FileStudyConfigurationManager;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor;
import org.opencb.opencga.storage.core.variant.io.VariantVcfDataWriter;
import org.opencb.opencga.storage.hadoop.utils.HBaseManager;
import org.opencb.opencga.storage.hadoop.variant.adaptors.HadoopVariantSourceDBAdaptor;
import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor;
import org.opencb.opencga.storage.hadoop.variant.index.AbstractVariantTableMapReduce;
import org.opencb.opencga.storage.hadoop.variant.converters.HBaseToVariantConverter;
import org.opencb.opencga.storage.hadoop.variant.index.VariantTableMapper;
import org.opencb.opencga.storage.hadoop.variant.metadata.HBaseStudyConfigurationManager;
import org.opencb.opencga.storage.hadoop.variant.models.protobuf.VariantTableStudyRowsProto;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.net.URI;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static org.hamcrest.CoreMatchers.*;
import static org.junit.Assert.*;
import static org.opencb.opencga.storage.hadoop.variant.VariantHbaseTestUtils.printVariants;
import static org.opencb.opencga.storage.hadoop.variant.VariantHbaseTestUtils.printVariantsFromArchiveTable;
import static org.opencb.opencga.storage.hadoop.variant.VariantHbaseTestUtils.printVariantsFromVariantsTable;
/**
* Created on 21/01/16
*
* @author Jacobo Coll <jacobo167@gmail.com>
*/
public class VariantHadoopMultiSampleTest extends VariantStorageBaseTest implements HadoopVariantStorageTest {
// Shared external resource (Hadoop/HBase test environment) started once for the whole class.
@ClassRule
public static ExternalResource externalResource = new HadoopExternalResource();
// Variant types that VariantTableMapper targets; only variants of these types are counted/compared below.
public static final List<VariantType> VARIANT_TYPES = Arrays.asList(VariantTableMapper.getTargetVariantType());
// Variants that are wrong in the platinum files that should not be included
private static final HashSet<String> PLATINUM_SKIP_VARIANTS = new HashSet<>();
/**
 * Clears both the variants table and the archive table before each test, and configures the
 * HBase-to-Variant converter to fail fast when it encounters a malformed variant, so data
 * corruption surfaces as a test failure instead of being silently skipped.
 */
@Before
public void setUp() throws Exception {
HadoopVariantStorageEngine variantStorageManager = getVariantStorageEngine();
clearDB(variantStorageManager.getVariantTableName(DB_NAME));
clearDB(variantStorageManager.getArchiveTableName(STUDY_ID));
//Force HBaseConverter to fail if something goes wrong
HBaseToVariantConverter.setFailOnWrongVariants(true);
}
/**
 * Extra engine options applied to every test: skip creating Phoenix indexes (not needed for
 * these tests and slow to build on the mini-cluster).
 */
@Override
public Map<String, ?> getOtherStorageConfigurationOptions() {
return new ObjectMap(AbstractHadoopVariantStoragePipeline.SKIP_CREATE_PHOENIX_INDEXES, true);
}
// loadFile overloads: all delegate to the 7-arg variant below, filling in defaults.
// Convenience overload: explicit fileId, no extra params.
public VariantSource loadFile(String resourceName, int fileId, StudyConfiguration studyConfiguration) throws Exception {
return loadFile(resourceName, fileId, studyConfiguration, null);
}
// Convenience overload: fileId -1 means "let the engine assign the file id".
public VariantSource loadFile(String resourceName, StudyConfiguration studyConfiguration, Map<? extends String, ?> otherParams) throws Exception {
return loadFile(resourceName, -1, studyConfiguration, otherParams);
}
// Convenience overload: transform + load archive + load variants all enabled.
public VariantSource loadFile(String resourceName, int fileId, StudyConfiguration studyConfiguration, Map<? extends String, ?> otherParams) throws Exception {
return loadFile(resourceName, fileId, studyConfiguration, otherParams, true, true, true);
}
/**
 * Transforms and/or loads one test resource VCF into the storage engine.
 *
 * @param resourceName       classpath resource of the VCF (possibly gzipped)
 * @param fileId             file id to register, or -1 for engine-assigned
 * @param studyConfiguration study to load into
 * @param otherParams        extra engine options, may be null
 * @param doTransform        run the transform step
 * @param loadArchive        load the archive (slice) table
 * @param loadVariant        load the variants table
 * @return the VariantSource describing the loaded file
 */
public VariantSource loadFile(String resourceName, int fileId, StudyConfiguration studyConfiguration,
Map<? extends String, ?> otherParams, boolean doTransform, boolean loadArchive, boolean loadVariant)
throws Exception {
return VariantHbaseTestUtils.loadFile(getVariantStorageEngine(), DB_NAME, outputUri, resourceName, fileId, studyConfiguration,
otherParams, doTransform, loadArchive, loadVariant);
}
/**
 * Loads s1 then s2 sequentially into the same study, checking archive timestamps after each
 * load, and finally verifies the merged genotypes with {@link #checkLoadedFilesS1S2}.
 */
@Test
public void testTwoFiles() throws Exception {
StudyConfiguration studyConfiguration = VariantStorageBaseTest.newStudyConfiguration();
VariantHadoopDBAdaptor dbAdaptor = getVariantStorageEngine().getDBAdaptor(DB_NAME);
VariantSource source1 = loadFile("s1.genome.vcf", studyConfiguration, Collections.emptyMap());
checkArchiveTableTimeStamp(dbAdaptor);
// Re-read the study configuration: the load updates it server-side.
studyConfiguration = dbAdaptor.getStudyConfigurationManager().getStudyConfiguration(studyConfiguration.getStudyId(), null).first();
VariantSource source2 = loadFile("s2.genome.vcf", studyConfiguration, Collections.emptyMap());
checkArchiveTableTimeStamp(dbAdaptor);
// printVariantsFromArchiveTable(dbAdaptor, studyConfiguration);
printVariants(studyConfiguration, dbAdaptor, newOutputUri());
checkLoadedFilesS1S2(studyConfiguration, dbAdaptor);
}
/**
 * Loads s1 and s2 in a single concurrent index() call using direct HBase load and proto
 * transform format, then asserts that both files (ids 0 and 1) end up indexed.
 */
@Test
public void testTwoFilesConcurrent() throws Exception {
StudyConfiguration studyConfiguration = VariantStorageBaseTest.newStudyConfiguration();
HadoopVariantStorageEngine variantStorageManager = getVariantStorageEngine();
ObjectMap options = variantStorageManager.getConfiguration().getStorageEngine(variantStorageManager.getStorageEngineId()).getVariant().getOptions();
options.put(HadoopVariantStorageEngine.HADOOP_LOAD_DIRECT, true);
options.put(VariantStorageEngine.Options.TRANSFORM_FORMAT.key(), "proto");
options.put(VariantStorageEngine.Options.DB_NAME.key(), DB_NAME);
options.put(VariantStorageEngine.Options.STUDY_ID.key(), studyConfiguration.getStudyId());
options.put(VariantStorageEngine.Options.STUDY_NAME.key(), studyConfiguration.getStudyName());
List<URI> inputFiles = Arrays.asList(getResourceUri("s1.genome.vcf"), getResourceUri("s2.genome.vcf"));
// index(transform=true, load=true) over both files at once.
List<StoragePipelineResult> index = variantStorageManager.index(inputFiles, outputUri, true, true, true);
VariantHadoopDBAdaptor dbAdaptor = variantStorageManager.getDBAdaptor(DB_NAME);
studyConfiguration = dbAdaptor.getStudyConfigurationManager().getStudyConfiguration(studyConfiguration.getStudyId(), null).first();
for (StoragePipelineResult storagePipelineResult : index) {
System.out.println(storagePipelineResult);
}
// Dump the merged archive table to a JSON file for manual inspection.
try(PrintStream out = new PrintStream(new FileOutputStream(outputUri.resolve("s1-2.merged.archive.json").getPath()))){
printVariantsFromArchiveTable(dbAdaptor, studyConfiguration, out);
}
for (Variant variant : dbAdaptor) {
System.out.println("variant = " + variant);
}
// checkLoadedFilesS1S2(studyConfiguration, dbAdaptor);
assertThat(studyConfiguration.getIndexedFiles(), hasItems(0, 1));
}
/**
 * Two-phase pipeline over several platinum files: first transform-only (archive/variant load
 * disabled) producing proto files, then load the proto files with archive load enabled and
 * variant load deferred via HADOOP_LOAD_VARIANT_PENDING_FILES. Prints results; no asserts.
 */
@Test
public void testMultipleFilesProtoConcurrent() throws Exception {
List<URI> protoFiles = new LinkedList<>();
StudyConfiguration studyConfiguration = VariantStorageBaseTest.newStudyConfiguration();
HadoopVariantStorageEngine variantStorageManager = getVariantStorageEngine();
ObjectMap options = variantStorageManager.getConfiguration().getStorageEngine(variantStorageManager.getStorageEngineId()).getVariant().getOptions();
options.put(HadoopVariantStorageEngine.HADOOP_LOAD_ARCHIVE, false);
options.put(HadoopVariantStorageEngine.HADOOP_LOAD_VARIANT, false);
options.put(VariantStorageEngine.Options.TRANSFORM_FORMAT.key(), "proto");
options.put(VariantStorageEngine.Options.DB_NAME.key(), DB_NAME);
options.put(VariantStorageEngine.Options.STUDY_ID.key(), STUDY_ID);
options.put(VariantStorageEngine.Options.STUDY_NAME.key(), STUDY_NAME);
options.put(VariantStorageEngine.Options.FILE_ID.key(), -1);
List<URI> inputFiles = new LinkedList<>();
// for (int fileId = 12877; fileId <= 12893; fileId++) {
// Phase 1: transform each platinum file (load disabled), collecting the proto outputs.
for (int fileId = 12877; fileId <= 12879; fileId++) {
String fileName = "platinum/1K.end.platinum-genomes-vcf-NA" + fileId + "_S1.genome.vcf.gz";
// inputFiles.add(getResourceUri(fileName));
List<StoragePipelineResult> results = variantStorageManager.index(Collections.singletonList(getResourceUri(fileName)), outputUri, true, true, false);
protoFiles.add(results.get(0).getTransformResult());
}
// dbAdaptor.getStudyConfigurationManager().updateStudyConfiguration(studyConfiguration, null);
// NOTE(review): leftover debug restriction — only the first 2 of 3 transformed files are loaded.
protoFiles = protoFiles.subList(0,2); // TODO remove
// Phase 2: direct-load the proto files into the archive table; variant load stays pending.
options.put(HadoopVariantStorageEngine.HADOOP_LOAD_DIRECT, true);
options.put(HadoopVariantStorageEngine.HADOOP_LOAD_ARCHIVE, true);
options.put(HadoopVariantStorageEngine.HADOOP_LOAD_VARIANT, false);
options.put(HadoopVariantStorageEngine.HADOOP_LOAD_VARIANT_PENDING_FILES, Arrays.asList(5,6,7));
List<StoragePipelineResult> index2 = variantStorageManager.index(protoFiles, outputUri, false, false, true);
System.out.println(index2);
}
@Test
public void testMultipleFilesConcurrentSpecificPut() throws Exception {
testMultipleFilesConcurrent(true);
}
@Test
public void testMultipleFilesConcurrentFullPut() throws Exception {
testMultipleFilesConcurrent(false);
}
/**
 * Loads the 17 platinum files (NA12877..NA12893) concurrently with direct load, then checks
 * every file was registered and has VcfMeta, and finally exports the merged study to VCF.
 *
 * @param specificput value for AbstractVariantTableMapReduce.SPECIFIC_PUT (specific vs full puts)
 */
public void testMultipleFilesConcurrent(boolean specificput) throws Exception {
StudyConfiguration studyConfiguration = VariantStorageBaseTest.newStudyConfiguration();
HadoopVariantStorageEngine variantStorageManager = getVariantStorageEngine();
VariantHadoopDBAdaptor dbAdaptor = variantStorageManager.getDBAdaptor(DB_NAME);
List<URI> inputFiles = new LinkedList<>();
for (int fileId = 12877; fileId <= 12893; fileId++) {
String fileName = "platinum/1K.end.platinum-genomes-vcf-NA" + fileId + "_S1.genome.vcf.gz";
inputFiles.add(getResourceUri(fileName));
}
ObjectMap options = variantStorageManager.getConfiguration().getStorageEngine(variantStorageManager.getStorageEngineId()).getVariant().getOptions();
options.put(HadoopVariantStorageEngine.HADOOP_LOAD_DIRECT, true);
options.put(VariantStorageEngine.Options.TRANSFORM_FORMAT.key(), "proto");
options.put(VariantStorageEngine.Options.DB_NAME.key(), DB_NAME);
options.put(VariantStorageEngine.Options.STUDY_ID.key(), studyConfiguration.getStudyId());
options.put(VariantStorageEngine.Options.STUDY_NAME.key(), studyConfiguration.getStudyName());
options.put(AbstractVariantTableMapReduce.SPECIFIC_PUT, specificput);
List<StoragePipelineResult> index = variantStorageManager.index(inputFiles, outputUri, true, true, true);
for (StoragePipelineResult storagePipelineResult : index) {
System.out.println(storagePipelineResult);
}
// Dump the merged archive table for manual inspection.
try(PrintStream out = new PrintStream(new FileOutputStream(outputUri.resolve("platinum.merged.archive.json").getPath()))){
printVariantsFromArchiveTable(dbAdaptor, studyConfiguration, out);
}
// checkLoadedVariants(expectedVariants, dbAdaptor, PLATINUM_SKIP_VARIANTS);
for (Variant variant : dbAdaptor) {
System.out.println("variant = " + variant);
}
studyConfiguration = dbAdaptor.getStudyConfigurationManager().getStudyConfiguration(studyConfiguration.getStudyId(), null).first();
System.out.println("StudyConfiguration = " + studyConfiguration);
HadoopVariantSourceDBAdaptor fileMetadataManager = dbAdaptor.getVariantSourceDBAdaptor();
Set<Integer> loadedFiles = fileMetadataManager.getLoadedFiles(studyConfiguration.getStudyId());
System.out.println("loadedFiles = " + loadedFiles);
// NOTE(review): assumes the 17 input files get engine-assigned ids 0..16 — confirm against the id-assignment policy.
for (int fileId = 0; fileId <= 16; fileId++) {
assertTrue(loadedFiles.contains(fileId));
}
// Every loaded file must have its VCF header metadata stored.
for (Integer loadedFile : loadedFiles) {
VcfMeta vcfMeta = fileMetadataManager.getVcfMeta(studyConfiguration.getStudyId(), loadedFile, null);
assertNotNull(vcfMeta);
}
// Export the study configuration and the merged variants as a sorted VCF for inspection.
URI outputUri = newOutputUri();
FileStudyConfigurationManager.write(studyConfiguration, new File(outputUri.resolve("study_configuration.json").getPath()).toPath());
try (FileOutputStream out = new FileOutputStream(outputUri.resolve("platinum.merged.vcf").getPath())) {
VariantVcfDataWriter.htsExport(dbAdaptor.iterator(new Query(), new QueryOptions(QueryOptions.SORT, true)),
studyConfiguration, dbAdaptor.getVariantSourceDBAdaptor(), out, new QueryOptions());
}
}
/**
 * Failure-recovery scenario: the first load of s1 is forced to fail on a specific slice,
 * then the load is resumed from the already-transformed proto file, s2 is loaded normally,
 * and the batch history is checked (first batch READY but with an ERROR status entry,
 * second batch READY with no ERROR).
 */
@Test
public void testTwoFilesFailOne() throws Exception {
StudyConfiguration studyConfiguration = VariantStorageBaseTest.newStudyConfiguration();
VariantHadoopDBAdaptor dbAdaptor = getVariantStorageEngine().getDBAdaptor(DB_NAME);
try {
// SLICE_TO_FAIL makes the mapper throw while processing slice "1_000000000011".
VariantSource source1 = loadFile("s1.genome.vcf", studyConfiguration,
Collections.singletonMap(VariantTableMapperFail.SLICE_TO_FAIL, "1_000000000011"));
fail();
} catch (StoragePipelineException e) {
HBaseStudyConfigurationManager scm = (HBaseStudyConfigurationManager) dbAdaptor.getStudyConfigurationManager();
studyConfiguration = scm.getStudyConfiguration(STUDY_ID, new QueryOptions()).first();
System.out.println("studyConfiguration: " + studyConfiguration);
System.out.println(studyConfiguration.getIndexedFiles());
e.printStackTrace();
}
Integer fileId = studyConfiguration.getFileIds().get("s1.genome.vcf");
System.out.println("fileId = " + fileId);
// Resume: load the variants from the proto output of the failed run (no transform, no archive),
// with SLICE_TO_FAIL set to a slice name ("_") that never matches, so this attempt succeeds.
VariantSource source1 = loadFile("s1.genome.vcf.variants.proto.gz", -1, studyConfiguration,
Collections.singletonMap(VariantTableMapperFail.SLICE_TO_FAIL, "_"), false, false, true);
checkArchiveTableTimeStamp(dbAdaptor);
VariantSource source2 = loadFile("s2.genome.vcf", studyConfiguration, Collections.emptyMap());
checkArchiveTableTimeStamp(dbAdaptor);
// printVariants(studyConfiguration, dbAdaptor, newOutputUri());
checkLoadedFilesS1S2(studyConfiguration, dbAdaptor);
assertEquals(2, studyConfiguration.getBatches().size());
BatchFileOperation batch = studyConfiguration.getBatches().get(0);
assertEquals(BatchFileOperation.Status.READY, batch.currentStatus());
assertThat(batch.getStatus().values(), hasItem(BatchFileOperation.Status.ERROR));
batch = studyConfiguration.getBatches().get(1);
assertEquals(BatchFileOperation.Status.READY, batch.currentStatus());
assertThat(batch.getStatus().values(),
not(hasItem(BatchFileOperation.Status.ERROR)));
}
/**
 * Asserts the exact merged content after loading s1.genome.vcf and s2.genome.vcf:
 * 16 distinct variants, with the expected per-sample genotypes and genotype filters
 * (see the inline table). Also checks no variant is returned twice by the iterator.
 */
public void checkLoadedFilesS1S2(StudyConfiguration studyConfiguration, VariantHadoopDBAdaptor dbAdaptor) {
System.out.println("studyConfiguration = " + studyConfiguration);
Map<String, Variant> variants = new HashMap<>();
for (Variant variant : dbAdaptor) {
String v = variant.toString();
// Each variant must appear exactly once in the iterator.
assertFalse(variants.containsKey(v));
variants.put(v, variant);
// Temporarily drop the annotation so the printed JSON stays readable, then restore it.
VariantAnnotation a = variant.getAnnotation();
variant.setAnnotation(null);
System.out.println(variant.toJson());
variant.setAnnotation(a);
}
String studyName = studyConfiguration.getStudyName();
// TODO: Add more asserts
// TODO: Update with last changes!
/* s1 s2
1 10013 T C 0/1 0/0
1 10014 A T 0/1 0/2
1 10014 A G 0/2 0/1
1 10030 T G 0/0 0/1
1 10031 T G 0/1 0/1
1 10032 A G 0/1 0/0
1 11000 T G 1/1 0/1
1 12000 T G 1/1 0/0
1 13000 T G 0/0 0/1
*/
assertEquals(16, variants.size());
assertTrue(variants.containsKey("1:10013:T:C"));
assertEquals("0/1", variants.get("1:10013:T:C").getStudy(studyName).getSampleData("s1", "GT"));
assertEquals("0/0", variants.get("1:10013:T:C").getStudy(studyName).getSampleData("s2", "GT"));
assertTrue(variants.containsKey("1:10014:A:T"));
assertEquals("0/1", variants.get("1:10014:A:T").getStudy(studyName).getSampleData("s1", "GT"));
assertEquals("0/2", variants.get("1:10014:A:T").getStudy(studyName).getSampleData("s2", "GT"));
assertTrue(variants.containsKey("1:10014:A:G"));
assertEquals("0/2", variants.get("1:10014:A:G").getStudy(studyName).getSampleData("s1", "GT"));
assertEquals("0/1", variants.get("1:10014:A:G").getStudy(studyName).getSampleData("s2", "GT"));
assertTrue(variants.containsKey("1:10030:T:G"));
assertEquals("0/0", variants.get("1:10030:T:G").getStudy(studyName).getSampleData("s1", "GT"));
assertEquals("0/1", variants.get("1:10030:T:G").getStudy(studyName).getSampleData("s2", "GT"));
assertTrue(variants.containsKey("1:10031:T:G"));
assertEquals("0/1", variants.get("1:10031:T:G").getStudy(studyName).getSampleData("s1", "GT"));
assertEquals("0/1", variants.get("1:10031:T:G").getStudy(studyName).getSampleData("s2", "GT"));
assertTrue(variants.containsKey("1:10032:A:G"));
// File-level FILTER attribute and per-sample genotype filters are checked for this variant.
assertEquals("1", variants.get("1:10032:A:G").getStudy(studyName).getFiles().get(0).getAttributes().get("PASS"));
assertEquals("0/1", variants.get("1:10032:A:G").getStudy(studyName).getSampleData("s1", "GT"));
assertEquals("PASS", variants.get("1:10032:A:G").getStudy(studyName).getSampleData("s1", VariantMerger.GENOTYPE_FILTER_KEY));
assertEquals("0/0", variants.get("1:10032:A:G").getStudy(studyName).getSampleData("s2", "GT"));
assertEquals("LowGQX", variants.get("1:10032:A:G").getStudy(studyName).getSampleData("s2", VariantMerger.GENOTYPE_FILTER_KEY));
assertTrue(variants.containsKey("1:11000:T:G"));
assertEquals("1/1", variants.get("1:11000:T:G").getStudy(studyName).getSampleData("s1", "GT"));
assertEquals("0/1", variants.get("1:11000:T:G").getStudy(studyName).getSampleData("s2", "GT"));
assertTrue(variants.containsKey("1:12000:T:G"));
assertEquals("1/1", variants.get("1:12000:T:G").getStudy(studyName).getSampleData("s1", "GT"));
assertEquals(".", variants.get("1:12000:T:G").getStudy(studyName).getSampleData("s1", VariantMerger.GENOTYPE_FILTER_KEY));
assertEquals("0/0", variants.get("1:12000:T:G").getStudy(studyName).getSampleData("s2", "GT"));
assertEquals("HighDPFRatio;LowGQX", variants.get("1:12000:T:G").getStudy(studyName).getSampleData("s2", VariantMerger.GENOTYPE_FILTER_KEY));
assertTrue(variants.containsKey("1:13000:T:G"));
assertEquals("0/0", variants.get("1:13000:T:G").getStudy(studyName).getSampleData("s1", "GT"));
assertEquals("0/1", variants.get("1:13000:T:G").getStudy(studyName).getSampleData("s2", "GT"));
}
/**
 * Loads platinum files one at a time (capped at {@code maxFilesLoaded}), verifying after each
 * load the archive-table variant count, the indexed-files set, and the archive timestamps;
 * finally compares the full variants table against the accumulated expected set.
 */
@Test
public void testPlatinumFilesOneByOne() throws Exception {
StudyConfiguration studyConfiguration = VariantStorageBaseTest.newStudyConfiguration();
List<VariantSource> sources = new LinkedList<>();
Set<String> expectedVariants = new HashSet<>();
VariantHadoopDBAdaptor dbAdaptor = getVariantStorageEngine().getDBAdaptor(DB_NAME);
HBaseStudyConfigurationManager scm = (HBaseStudyConfigurationManager) dbAdaptor.getStudyConfigurationManager();
// Cap the run to keep the test fast; the loop below covers ids 12877..12893.
int maxFilesLoaded = 3;
for (int fileId = 12877; fileId <= 12893; fileId++) {
VariantSource source = loadFile("platinum/1K.end.platinum-genomes-vcf-NA" + fileId + "_S1.genome.vcf.gz", fileId, studyConfiguration);
studyConfiguration = scm.getStudyConfiguration(studyConfiguration.getStudyId(), new QueryOptions()).first();
System.out.println(studyConfiguration);
Set<String> variants = checkArchiveTableLoadedVariants(studyConfiguration, dbAdaptor, source);
sources.add(source);
expectedVariants.addAll(variants);
assertTrue(studyConfiguration.getIndexedFiles().contains(fileId));
// checkLoadedVariants(expectedVariants, dbAdaptor, PLATINUM_SKIP_VARIANTS);
checkArchiveTableTimeStamp(dbAdaptor);
if (sources.size() >= maxFilesLoaded) {
break;
}
}
printVariantsFromArchiveTable(dbAdaptor, studyConfiguration);
for (Variant variant : dbAdaptor) {
System.out.println("variant = " + variant);
}
System.out.println(studyConfiguration);
checkLoadedVariants(expectedVariants, dbAdaptor, PLATINUM_SKIP_VARIANTS);
}
/**
 * Batch-load scenario: loads all platinum files but the last with variant load disabled
 * (archive only), then loads the final file with variant load enabled and the full file-id
 * list as pending files, so one merge processes every file. Verifies all files end up
 * indexed and the variants table matches the expected set.
 */
@Test
public void testPlatinumFilesBatchLoad() throws Exception {
StudyConfiguration studyConfiguration = VariantStorageBaseTest.newStudyConfiguration();
List<VariantSource> sources = new LinkedList<>();
Set<String> expectedVariants = new HashSet<>();
VariantHadoopDBAdaptor dbAdaptor = getVariantStorageEngine().getDBAdaptor(DB_NAME);
// File ids 12877..12893 (IntStream.range upper bound is exclusive).
List<Integer> fileIds = IntStream.range(12877, 12894).mapToObj(i -> i).collect(Collectors.toList());
// Archive-only load for every file except the last one.
for (Integer fileId : fileIds.subList(0,fileIds.size()-1)) {
VariantSource source = loadFile("platinum/1K.end.platinum-genomes-vcf-NA" + fileId + "_S1.genome.vcf.gz", fileId, studyConfiguration,
new ObjectMap(HadoopVariantStorageEngine.HADOOP_LOAD_VARIANT, false));
sources.add(source);
expectedVariants.addAll(checkArchiveTableLoadedVariants(studyConfiguration, dbAdaptor, source));
assertFalse(studyConfiguration.getIndexedFiles().contains(fileId));
}
// Final file triggers the variants-table merge for the whole pending batch.
Integer fileId = fileIds.get(fileIds.size()-1);
VariantSource source = loadFile("platinum/1K.end.platinum-genomes-vcf-NA" + fileId + "_S1.genome.vcf.gz", fileId, studyConfiguration,
new ObjectMap(HadoopVariantStorageEngine.HADOOP_LOAD_VARIANT, true)
.append(HadoopVariantStorageEngine.HADOOP_LOAD_VARIANT_PENDING_FILES, StringUtils.join(fileIds, ","))
);
sources.add(source);
expectedVariants.addAll(checkArchiveTableLoadedVariants(studyConfiguration, dbAdaptor, source));
HBaseStudyConfigurationManager scm = (HBaseStudyConfigurationManager) dbAdaptor.getStudyConfigurationManager();
studyConfiguration = scm.getStudyConfiguration(studyConfiguration.getStudyId(), new QueryOptions()).first();
System.out.println("studyConfiguration = " + studyConfiguration.getAttributes().toJson());
System.out.println("HBaseStudyConfiguration = " + studyConfiguration);
for (fileId = 12877; fileId <= 12893; fileId++) {
assertTrue(studyConfiguration.getIndexedFiles().contains(fileId));
}
for (Variant variant : dbAdaptor) {
System.out.println(variant);
}
// printVariants(studyConfiguration, dbAdaptor, newOutputUri());
checkArchiveTableTimeStamp(dbAdaptor);
checkLoadedVariants(expectedVariants, dbAdaptor, PLATINUM_SKIP_VARIANTS);
}
/**
 * Compares the variants table against the expected variant keys (minus the known-bad
 * platinum variants). On a count mismatch it prints unexpected/missing variants and the
 * whole table before failing, to ease debugging. Then re-iterates to assert every stored
 * variant is expected and the totals match.
 *
 * @param expectedVariants     expected variant keys; MUTATED: skip-variants are removed from it
 * @param dbAdaptor            adaptor over the variants table
 * @param platinumSkipVariants variants to exclude from the expectation
 */
public void checkLoadedVariants(Set<String> expectedVariants, VariantHadoopDBAdaptor dbAdaptor, HashSet<String> platinumSkipVariants)
throws IOException {
long count = dbAdaptor.count(null).first();
expectedVariants.removeAll(platinumSkipVariants);
System.out.println("count = " + count);
System.out.println("expectedVariants = " + expectedVariants.size());
// Diagnostic pass: only runs when the assertion below is going to fail.
if (expectedVariants.size() != count) {
Set<String> loadedVariants = new HashSet<>();
for (Variant variant : dbAdaptor) {
loadedVariants.add(variant.toString());
if (!expectedVariants.contains(variant.toString())) {
System.out.println("unexpectedVariant: " + variant);
}
}
for (String expectedVariant : expectedVariants) {
if (!loadedVariants.contains(expectedVariant)) {
System.out.println("Missing variant: " + expectedVariant);
}
}
printVariantsFromVariantsTable(dbAdaptor);
}
assertEquals(expectedVariants.size(), count);
count = 0;
for (Variant variant : dbAdaptor) {
count++;
assertTrue(expectedVariants.contains(variant.toString()));
}
assertEquals(expectedVariants.size(), count);
}
/**
 * Checks that the archive table holds, for the given file, exactly as many variants of the
 * targeted types as the file's own per-type statistics report, and returns those variant keys.
 */
public Set<String> checkArchiveTableLoadedVariants(StudyConfiguration studyConfiguration, VariantHadoopDBAdaptor dbAdaptor,
VariantSource source) {
int fileId = Integer.valueOf(source.getFileId());
Set<String> variants = getVariants(dbAdaptor, studyConfiguration, fileId);
// Sum the counts of the targeted variant types from the source statistics.
int expected = source.getStats().getVariantTypeCounts().entrySet().stream()
.filter(entry -> VARIANT_TYPES.contains(VariantType.valueOf(entry.getKey())))
.map(Map.Entry::getValue).reduce((i1, i2) -> i1 + i2).orElse(0).intValue();
assertEquals(expected, variants.size());
return variants;
}
/**
 * Reads the given file's variants from the archive table (QueryOptions "archive" = true) and
 * returns the keys of those whose type is in {@link #VARIANT_TYPES}. A few hand-picked keys
 * are echoed to stdout when seen, as a debugging aid.
 */
protected Set<String> getVariants(VariantHadoopDBAdaptor dbAdaptor, StudyConfiguration studyConfiguration, int fileId){
//  Map<String, Integer> variantCounts = new HashMap<>();
Set<String> variants = new HashSet<>();
// Sentinel variants logged when encountered (debug aid only; does not affect the result).
Set<String> observed = new HashSet<>(Arrays.asList("M:516:-:CA", "1:10231:C:-", "1:10352:T:A", "M:515:G:A"));
System.out.println("Query from Archive table");
dbAdaptor.iterator(
new Query()
.append(VariantDBAdaptor.VariantQueryParams.STUDIES.key(), studyConfiguration.getStudyId())
.append(VariantDBAdaptor.VariantQueryParams.FILES.key(), fileId),
new QueryOptions("archive", true))
.forEachRemaining(variant -> {
if (VARIANT_TYPES.contains(variant.getType())) {
String string = variant.toString();
if (observed.contains(string)) {
System.out.println("Variant " + string + " found in file " + fileId);
}
variants.add(string);
}
//                    variantCounts.compute(variant.getType().toString(), (s, integer) -> integer == null ? 1 : (integer + 1));
});
return variants;
}
/**
 * Scans the archive table's variant columns (rows matching VARIANT_COLUMN_B_PREFIX) and
 * asserts every stored VariantTableStudyRowsProto carries the timestamp of the most recent
 * batch in the study configuration, i.e. the last load updated all variant cells.
 */
// NOTE(review): the HBaseManager is never closed and the ResultScanner is not in a
// try-with-resources (it leaks if an assertion throws) — consider tightening resource handling.
protected void checkArchiveTableTimeStamp(VariantHadoopDBAdaptor dbAdaptor) throws Exception {
HBaseStudyConfigurationManager scm = (HBaseStudyConfigurationManager) dbAdaptor.getStudyConfigurationManager();
StudyConfiguration studyConfiguration = scm.getStudyConfiguration(STUDY_ID, new QueryOptions()).first();
String tableName = HadoopVariantStorageEngine.getArchiveTableName(STUDY_ID, dbAdaptor.getConfiguration());
System.out.println("Query from archive HBase " + tableName);
HBaseManager hm = new HBaseManager(configuration.get());
GenomeHelper helper = dbAdaptor.getGenomeHelper();
// Timestamp of the most recent batch operation — the expected value for every cell.
long ts = studyConfiguration.getBatches().get(studyConfiguration.getBatches().size() - 1).getTimestamp();
hm.act(tableName, table -> {
Scan scan = new Scan();
scan.setFilter(new PrefixFilter(GenomeHelper.VARIANT_COLUMN_B_PREFIX));
ResultScanner resultScanner = table.getScanner(scan);
for (Result result : resultScanner) {
List<Cell> cells = GenomeHelper.getVariantColumns(result.rawCells());
assertNotEquals(0, cells.size());
for (Cell cell : cells) {
VariantTableStudyRowsProto proto = VariantTableStudyRowsProto.parseFrom(CellUtil.cloneValue(cell));
assertEquals(ts, proto.getTimestamp());
}
}
resultScanner.close();
return null;
});
}
}