/*
 * Copyright 2015-2016 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.opencga.storage.hadoop.variant;

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.*;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.models.variant.avro.ConsequenceType;
import org.opencb.biodata.models.variant.protobuf.VcfSliceProtos;
import org.opencb.biodata.models.variant.stats.VariantGlobalStats;
import org.opencb.biodata.tools.variant.converters.proto.VcfSliceToVariantListConverter;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryResult;
import org.opencb.opencga.storage.core.StoragePipelineResult;
import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine.Options;
import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBIterator;
import org.opencb.opencga.storage.hadoop.utils.HBaseManager;
import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor;
import org.opencb.opencga.storage.hadoop.variant.archive.ArchiveHelper;
import org.opencb.opencga.storage.hadoop.variant.index.VariantTableMapper;
import org.opencb.opencga.storage.hadoop.variant.models.protobuf.VariantTableStudyRowsProto;

import java.io.IOException;
import java.net.URI;
import java.util.*;

import static org.junit.Assert.assertEquals;

/**
 * Tests that load a single VCF file ({@code sample1.genome.vcf}) into the Hadoop variant
 * storage engine once per class and then inspect the resulting variant and archive tables,
 * both through {@link VariantHadoopDBAdaptor} and by scanning the HBase tables directly.
 *
 * Created on 15/10/15
 *
 * @author Jacobo Coll <jacobo167@gmail.com>
 */
public class VariantHadoopManagerTest extends VariantStorageBaseTest implements HadoopVariantStorageTest {

    /** Adaptor opened in {@link #before()} and closed in {@link #tearDown()} for every test. */
    private VariantHadoopDBAdaptor dbAdaptor;
    private static StudyConfiguration studyConfiguration;
    private static VariantSource source;
    private static StoragePipelineResult etlResult = null;

    @ClassRule
    public static HadoopExternalResource externalResource = new HadoopExternalResource();

    /**
     * Result of an unfiltered query over the variant table.
     * Static because JUnit 4 instantiates the test class once per test method: an instance
     * field would always be {@code null} in {@link #before()} and the query would be repeated
     * for every test.
     */
    private static QueryResult<Variant> allVariantsQueryResult;

    /**
     * Clears the variant and archive tables, runs the default ETL over the test VCF with
     * direct loading into both tables, and reads back the resulting {@link VariantSource}.
     *
     * @throws Exception if the ETL or any storage operation fails
     */
    @BeforeClass
    public static void beforeClass() throws Exception {
        // Drop anything cached by a previous run of this class in the same JVM.
        allVariantsQueryResult = null;

        HadoopVariantStorageEngine variantStorageManager = externalResource.getVariantStorageEngine();
        externalResource.clearDB(variantStorageManager.getVariantTableName(DB_NAME));
        externalResource.clearDB(variantStorageManager.getArchiveTableName(STUDY_ID));

        // Alternative input used in the past: "variant-test-file.vcf.gz"
        URI inputUri = VariantStorageBaseTest.getResourceUri("sample1.genome.vcf");

        studyConfiguration = VariantStorageBaseTest.newStudyConfiguration();
        ObjectMap etlOptions = new ObjectMap(Options.TRANSFORM_FORMAT.key(), "proto")
                .append(Options.FILE_ID.key(), FILE_ID)
                .append(Options.ANNOTATE.key(), true)
                .append(Options.CALCULATE_STATS.key(), false)
                .append(HadoopVariantStorageEngine.HADOOP_LOAD_DIRECT, true)
                .append(HadoopVariantStorageEngine.HADOOP_LOAD_ARCHIVE, true)
                .append(HadoopVariantStorageEngine.HADOOP_LOAD_VARIANT, true);
        etlResult = VariantStorageBaseTest.runDefaultETL(inputUri, variantStorageManager, studyConfiguration, etlOptions);

        source = variantStorageManager.readVariantSource(etlResult.getTransformResult());
        VariantGlobalStats stats = source.getStats();
        Assert.assertNotNull(stats);

        try (VariantHadoopDBAdaptor dbAdaptor = variantStorageManager.getDBAdaptor(DB_NAME)) {
            VariantHbaseTestUtils.printVariantsFromVariantsTable(dbAdaptor);
            VariantHbaseTestUtils.printVariantsFromArchiveTable(dbAdaptor, studyConfiguration);
        }
    }

    /**
     * Opens a fresh adaptor for the test and, the first time only, caches the result of an
     * unfiltered query over the variant table.
     *
     * @throws Exception if the adaptor cannot be opened or the query fails
     */
    @Before
    @Override
    public void before() throws Exception {
        dbAdaptor = ((HadoopVariantStorageEngine) variantStorageManager).getDBAdaptor(DB_NAME);
        if (allVariantsQueryResult == null) {
            allVariantsQueryResult = dbAdaptor.get(new Query(), new QueryOptions());
        }
    }

    /**
     * Closes the adaptor opened in {@link #before()}.
     *
     * @throws Exception if closing the adaptor fails
     */
    @After
    public void tearDown() throws Exception {
        // @After also runs when @Before failed, in which case there is nothing to close.
        if (dbAdaptor != null) {
            dbAdaptor.close();
            dbAdaptor = null;
        }
    }

    @Test
    public void testConnection() throws StorageEngineException {
        variantStorageManager.testConnection();
    }

    /** Iterates over every variant of the study through the adaptor and prints it. */
    @Test
    public void queryVariantTable() {
        System.out.println("Query from Variant table");
        Query query = new Query(VariantDBAdaptor.VariantQueryParams.STUDIES.key(), studyConfiguration.getStudyId());
        VariantDBIterator iterator = dbAdaptor.iterator(query, new QueryOptions());
        while (iterator.hasNext()) {
            Variant variant = iterator.next();
            System.out.println("Phoenix variant = " + variant);
        }
        System.out.println("End query from Variant table");
    }

    /**
     * Checks that the total count matches the count derived from the source statistics and
     * that the counts of two regions add up to the total.
     */
    @Test
    public void countVariants() {
        String regionKey = VariantDBAdaptor.VariantQueryParams.REGION.key();
        long totalCount = dbAdaptor.count(new Query()).first();
        // NOTE(review): position 15030 is included in both regions; a variant located exactly
        // there would presumably be counted twice - confirm the fixture has none.
        long partialCount1 = dbAdaptor.count(new Query(regionKey, "1:1-15030")).first();
        long partialCount2 = dbAdaptor.count(new Query(regionKey, "1:15030-60000")).first();

        assertEquals(expectedVariantTableCount(), totalCount);
        assertEquals(totalCount, partialCount1 + partialCount2);
    }

    /**
     * Groups the cached variants by gene name and Ensembl gene id taken from their consequence
     * types, then checks that a count query filtered by each gene returns the same number.
     */
    @Test
    public void getVariantByGene() {
        // Group by Gene
        Map<String, Long> genesCount = new HashMap<>();
        for (Variant variant : allVariantsQueryResult.getResult()) {
            // A set, so that a variant contributes at most once to each gene.
            Set<String> genesInVariant = new HashSet<>();
            for (ConsequenceType consequenceType : variant.getAnnotation().getConsequenceTypes()) {
                String geneName = consequenceType.getGeneName();
                if (geneName != null) {
                    genesInVariant.add(geneName);
                }
                String ensemblGeneId = consequenceType.getEnsemblGeneId();
                if (ensemblGeneId != null) {
                    genesInVariant.add(ensemblGeneId);
                }
            }
            for (String gene : genesInVariant) {
                genesCount.merge(gene, 1L, Long::sum);
            }
        }

        System.out.println("genesCount = " + genesCount);

        // Count for each gene
        for (Map.Entry<String, Long> entry : genesCount.entrySet()) {
            System.out.println("Gene " + entry.getKey() + " in " + entry.getValue() + " variants");
            QueryResult<Long> queryResult = dbAdaptor.count(new Query(VariantDBAdaptor.VariantQueryParams.GENE.key(), entry.getKey()));
            System.out.println("queryResult.getDbTime() = " + queryResult.getDbTime());
            long count = queryResult.first();
            assertEquals(entry.getValue().longValue(), count);
        }
    }

    /**
     * Iterates the loaded file with the {@code archive} query option enabled and compares the
     * per-type counts and the number of records with the statistics of the source.
     */
    @Test
    public void queryArchiveTable() {
        // Single-element array so the counter can be updated from inside the lambda.
        final int[] numVariants = {0};
        Map<String, Integer> variantCounts = new HashMap<>();
        System.out.println("Query from Archive table");
        Query query = new Query()
                .append(VariantDBAdaptor.VariantQueryParams.STUDIES.key(), studyConfiguration.getStudyId())
                .append(VariantDBAdaptor.VariantQueryParams.FILES.key(), FILE_ID);
        dbAdaptor.iterator(query, new QueryOptions("archive", true)).forEachRemaining(variant -> {
            System.out.println("Variant from archive = " + variant.toJson());
            numVariants[0]++;
            variantCounts.merge(variant.getType().toString(), 1, Integer::sum);
        });
        System.out.println("End query from Archive table");

        source.getStats().getVariantTypeCounts().forEach(
                (type, expected) -> assertEquals(expected, variantCounts.getOrDefault(type, 0)));
        assertEquals(source.getStats().getNumRecords(), numVariants[0]);
    }

    /**
     * Scans the variant table directly in HBase, skipping rows whose key starts with the meta
     * row key, and compares the number of variant rows with the expected count.
     *
     * @throws IOException if the table scan fails
     */
    @Test
    public void checkVariantTable() throws IOException {
        System.out.println("Query from HBase : " + DB_NAME);
        HBaseManager hm = new HBaseManager(configuration.get());
        GenomeHelper genomeHelper = dbAdaptor.getGenomeHelper();
        int numVariants = hm.act(DB_NAME, table -> {
            int num = 0;
            // try-with-resources: the scanner is released even if a row fails to decode.
            try (ResultScanner resultScanner = table.getScanner(genomeHelper.getColumnFamily())) {
                for (Result result : resultScanner) {
                    if (Bytes.toString(result.getRow()).startsWith(genomeHelper.getMetaRowKeyString())) {
                        continue;
                    }
                    Variant variant = genomeHelper.extractVariantFromVariantRowKey(result.getRow());
                    System.out.println("Variant = " + variant);
                    if (!variant.getChromosome().equals(genomeHelper.getMetaRowKeyString())) {
                        num++;
                    }
                }
            }
            return num;
        });
        System.out.println("End query from HBase : " + DB_NAME);
        System.out.println(source.getStats().getVariantTypeCounts());

        assertEquals(expectedVariantTableCount(), numVariants);
    }

    /**
     * Scans the archive table directly in HBase and prints, for every non-meta row, the stored
     * VcfSlice, the variants converted from it and any variant columns found in the row.
     *
     * @throws Exception if the table scan or the decoding of a row fails
     */
    @Test
    public void checkArchiveTable() throws Exception {
        String tableName = getVariantStorageEngine().getArchiveTableName(STUDY_ID);
        System.out.println("Query from archive HBase " + tableName);
        HBaseManager hm = new HBaseManager(configuration.get());

        GenomeHelper genomeHelper = dbAdaptor.getGenomeHelper();
        ArchiveHelper archiveHelper = dbAdaptor.getArchiveHelper(studyConfiguration.getStudyId(), FILE_ID);
        VcfSliceToVariantListConverter converter = new VcfSliceToVariantListConverter(archiveHelper.getMeta());
        hm.act(tableName, table -> {
            // try-with-resources: the scanner is released even if a protobuf fails to parse.
            try (ResultScanner resultScanner = table.getScanner(genomeHelper.getColumnFamily())) {
                for (Result result : resultScanner) {
                    System.out.println("VcfSlice = " + Bytes.toString(result.getRow()));
                    if (Arrays.equals(result.getRow(), archiveHelper.getMetaRowKey())) {
                        continue;
                    }
                    byte[] sliceBytes = result.getValue(archiveHelper.getColumnFamily(), archiveHelper.getColumn());
                    VcfSliceProtos.VcfSlice vcfSlice = VcfSliceProtos.VcfSlice.parseFrom(sliceBytes);
                    System.out.println(vcfSlice);
                    List<Variant> variants = converter.convert(vcfSlice);
                    for (Variant variant : variants) {
                        System.out.println(variant.toJson());
                    }

                    List<Cell> cells = GenomeHelper.getVariantColumns(result.rawCells());
                    for (Cell cell : cells) {
                        byte[] cellValue = CellUtil.cloneValue(cell);
                        VariantTableStudyRowsProto proto = VariantTableStudyRowsProto.parseFrom(cellValue);
                        String column = Bytes.toString(CellUtil.cloneQualifier(cell));
                        System.out.println(column + " ts:" + proto.getTimestamp() + " value: " + proto);
                    }
                }
            }
            return null;
        });
        System.out.println("End query from archive HBase " + tableName);
    }

    /**
     * Checks that exactly one study is registered and that its id, name and indexed files
     * match the constants used to load the test file.
     *
     * @throws Exception if the study metadata cannot be read
     */
    @Test
    public void checkMeta() throws Exception {
        System.out.println("Get studies");
        List<String> studyNames = dbAdaptor.getStudyConfigurationManager().getStudyNames(new QueryOptions());
        assertEquals(1, studyNames.size());
        for (String studyName : studyNames) {
            System.out.println("studyName = " + studyName);
            StudyConfiguration sc = dbAdaptor.getStudyConfigurationManager().getStudyConfiguration(studyName, new QueryOptions()).first();
            // Expected value first, so that failure messages read correctly.
            assertEquals(STUDY_ID, sc.getStudyId());
            assertEquals(STUDY_NAME, sc.getStudyName());
            assertEquals(Collections.singleton(FILE_ID), sc.getIndexedFiles());
            System.out.println("sc = " + sc);
        }
    }

    /**
     * Number of rows expected in the variant table: the sum of the per-type counts from the
     * source statistics over the types returned by
     * {@link VariantTableMapper#getTargetVariantType()}, minus one conflicting deletion.
     *
     * @return the expected number of variants in the variant table
     */
    private static long expectedVariantTableCount() {
        long count = Arrays.stream(VariantTableMapper.getTargetVariantType())
                .mapToInt(type -> source.getStats().getVariantTypeCount(type))
                .sum();
        // Deletion is in conflict with other variant: 1:10403:ACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC:A
        return count - 1;
    }
}