/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
*
*/
package org.opencb.opencga.storage.hadoop.variant;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.phoenix.schema.IllegalDataException;
import org.apache.phoenix.schema.types.*;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.models.variant.protobuf.VcfSliceProtos;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.opencga.core.common.TimeUtils;
import org.opencb.opencga.storage.core.StoragePipelineResult;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.metadata.FileStudyConfigurationManager;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBIterator;
import org.opencb.opencga.storage.core.variant.annotation.VariantAnnotationManager;
import org.opencb.opencga.storage.hadoop.utils.HBaseManager;
import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor;
import org.opencb.opencga.storage.hadoop.variant.archive.ArchiveHelper;
import org.opencb.opencga.storage.hadoop.variant.archive.VariantHadoopArchiveDBIterator;
import org.opencb.opencga.storage.hadoop.variant.index.VariantTableStudyRow;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.PhoenixHelper;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.VariantPhoenixHelper;
import org.opencb.opencga.storage.hadoop.variant.models.protobuf.VariantTableStudyRowsProto;
import java.io.*;
import java.net.URI;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Map;
import static org.opencb.opencga.storage.core.variant.VariantStorageBaseTest.getTmpRootDir;
import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageTest.configuration;
/**
* Utility class for VariantStorage hadoop tests
*
* @author Matthias Haimel mh719+git@cam.ac.uk
*/
public class VariantHbaseTestUtils {
public static VariantHadoopDBAdaptor printVariantsFromArchiveTable(VariantHadoopDBAdaptor dbAdaptor,
StudyConfiguration studyConfiguration) throws Exception {
return printVariantsFromArchiveTable(dbAdaptor, studyConfiguration, System.out);
}
public static VariantHadoopDBAdaptor printVariantsFromArchiveTable(VariantHadoopDBAdaptor dbAdaptor,
StudyConfiguration studyConfiguration, Path output) throws Exception {
if (output.toFile().isDirectory()) {
String archiveTableName = HadoopVariantStorageEngine.getArchiveTableName(studyConfiguration.getStudyId(), dbAdaptor.getConfiguration());
output = output.resolve("archive._V." + archiveTableName + "." + TimeUtils.getTimeMillis() + ".txt");
}
try (FileOutputStream out = new FileOutputStream(output.toFile())) {
return printVariantsFromArchiveTable(dbAdaptor, studyConfiguration, new PrintStream(out));
}
}
public static VariantHadoopDBAdaptor printVariantsFromArchiveTable(VariantHadoopDBAdaptor dbAdaptor,
StudyConfiguration studyConfiguration, PrintStream out) throws Exception {
GenomeHelper helper = dbAdaptor.getGenomeHelper();
helper.getHBaseManager().act(HadoopVariantStorageEngine.getArchiveTableName(studyConfiguration.getStudyId(), dbAdaptor.getConfiguration()), table -> {
for (Result result : table.getScanner(helper.getColumnFamily())) {
GenomeHelper.getVariantColumns(result.rawCells()).stream()
.filter(c -> Bytes.startsWith(CellUtil.cloneFamily(c), helper.getColumnFamily()))
.forEach(c -> {
try {
byte[] value = CellUtil.cloneValue(c);
if (value != null) {
out.println(VariantTableStudyRowsProto.parseFrom(value));
}
} catch (Exception e) {
out.println("e.getMessage() = " + e.getMessage());
}
});
}
return 0;
});
return dbAdaptor;
}
public static void printVariantsFromVariantsTable(VariantHadoopDBAdaptor dbAdaptor) throws IOException {
printVariantsFromVariantsTable(dbAdaptor, getTmpRootDir());
}
public static void printVariantsFromVariantsTable(VariantHadoopDBAdaptor dbAdaptor, Path dir) throws IOException {
String tableName = HadoopVariantStorageEngine.getVariantTableName(VariantStorageBaseTest.DB_NAME, dbAdaptor.getConfiguration());
HBaseManager hm = new HBaseManager(configuration.get());
if (!hm.tableExists(tableName)) {
System.out.println("Table " + tableName + " does not exist!");
return;
}
System.out.println("Query from HBase : " + tableName);
GenomeHelper genomeHelper = dbAdaptor.getGenomeHelper();
Path outputFile;
if (dir.toFile().isDirectory()) {
outputFile = dir.resolve("variant." + tableName + "." + TimeUtils.getTimeMillis() + ".txt");
} else {
outputFile = dir;
}
System.out.println("Variant table file = " + outputFile);
PrintStream os = new PrintStream(new FileOutputStream(outputFile.toFile()));
int numVariants = hm.act(tableName, table -> {
int num = 0;
ResultScanner resultScanner = table.getScanner(genomeHelper.getColumnFamily());
for (Result result : resultScanner) {
if (Bytes.toString(result.getRow()).startsWith(genomeHelper.getMetaRowKeyString())) {
continue;
}
Variant variant = genomeHelper.extractVariantFromVariantRowKey(result.getRow());
os.println("Variant = " + variant);
for (Map.Entry<byte[], byte[]> entry : result.getFamilyMap(genomeHelper.getColumnFamily()).entrySet()) {
String key = Bytes.toString(entry.getKey());
PhoenixHelper.Column column = VariantPhoenixHelper.VariantColumn.getColumn(key);
if (column != null) {
os.println("\t" + key + " = " + length(entry.getValue()) + ", "
+ column.getPDataType().toObject(entry.getValue()));
} else if (key.endsWith(VariantPhoenixHelper.STATS_PROTOBUF_SUFIX)
|| key.endsWith("_" + VariantTableStudyRow.FILTER_OTHER)
|| key.endsWith("_" + VariantTableStudyRow.COMPLEX)) {
os.println("\t" + key + " = " + length(entry.getValue()) + ", " + Arrays.toString(entry.getValue()));
} else if (key.startsWith(VariantPhoenixHelper.POPULATION_FREQUENCY_PREFIX)) {
os.println("\t" + key + " = " + length(entry.getValue()) + ", " + PFloatArray.INSTANCE.toObject(entry.getValue()));
} else if (key.endsWith("_" + VariantTableStudyRow.HET_REF)
|| key.endsWith("_" + VariantTableStudyRow.HOM_VAR)
|| key.endsWith("_" + VariantTableStudyRow.NOCALL)
|| key.endsWith("_" + VariantTableStudyRow.OTHER)) {
os.println("\t" + key + " = " + PUnsignedIntArray.INSTANCE.toObject(entry.getValue()));
} else if (key.endsWith("_" + VariantTableStudyRow.HOM_REF)
|| key.endsWith("_" + VariantTableStudyRow.CALL_CNT)
|| key.endsWith("_" + VariantTableStudyRow.PASS_CNT)) {
os.println("\t" + key + " = " + PUnsignedInt.INSTANCE.toObject(entry.getValue()));
} else if (key.endsWith(VariantPhoenixHelper.MAF_SUFIX)
|| key.endsWith(VariantPhoenixHelper.MGF_SUFIX)) {
os.println("\t" + key + " = " + PFloat.INSTANCE.toObject(entry.getValue()));
} else if (entry.getValue().length == 4) {
Object o = null;
try {
o = PUnsignedInt.INSTANCE.toObject(entry.getValue());
} catch (IllegalDataException ignore) {}
os.println("\t" + key + " = "
+ PInteger.INSTANCE.toObject(entry.getValue()) + " , "
+ o + " , "
+ PFloat.INSTANCE.toObject(entry.getValue()) + " , ");
} else {
os.println("\t" + key + " ~ " + length(entry.getValue()) + ", "
+ Bytes.toString(entry.getValue()));
}
}
os.println("--------------------");
if (!variant.getChromosome().equals(genomeHelper.getMetaRowKeyString())) {
num++;
}
}
os.close();
resultScanner.close();
return num;
});
}
private static String length(byte[] array) {
return "(" + array.length + " B)";
}
private static void printVariantsFromDBAdaptor(VariantHadoopDBAdaptor dbAdaptor, Path dir) throws IOException {
String tableName = HadoopVariantStorageEngine.getVariantTableName(VariantStorageBaseTest.DB_NAME, dbAdaptor.getConfiguration());
Path outputFile;
if (dir.toFile().isDirectory()) {
outputFile = dir.resolve("variant." + tableName + "." + TimeUtils.getTimeMillis() + ".json");
} else {
outputFile = dir;
}
System.out.println("Variant table file = " + outputFile);
try (OutputStream os = new BufferedOutputStream(new FileOutputStream(outputFile.toFile()))) {
PrintStream out = new PrintStream(os);
printVariantsFromDBAdaptor(dbAdaptor, out);
}
}
private static void printVariantsFromDBAdaptor(VariantHadoopDBAdaptor dbAdaptor, PrintStream out) {
VariantDBIterator iterator = dbAdaptor.iterator(new Query(), new QueryOptions("simpleGenotypes", true));
while (iterator.hasNext()) {
Variant variant = iterator.next();
out.println(variant.toJson());
}
}
public static void printArchiveTable(StudyConfiguration studyConfiguration, VariantHadoopDBAdaptor dbAdaptor, Path outDir) throws Exception {
String archiveTableName = HadoopVariantStorageEngine.getArchiveTableName(studyConfiguration.getStudyId(), dbAdaptor.getConfiguration());
for (Integer fileId : studyConfiguration.getIndexedFiles()) {
try (OutputStream os = new FileOutputStream(outDir.resolve("archive." + fileId + "." + archiveTableName + "." + TimeUtils.getTimeMillis() + ".txt").toFile())) {
printArchiveTable(dbAdaptor, studyConfiguration, fileId, os);
}
}
}
public static void printArchiveTable(VariantHadoopDBAdaptor dbAdaptor,
StudyConfiguration studyConfiguration, int fileId, OutputStream os) throws Exception {
VariantHadoopArchiveDBIterator archive = (VariantHadoopArchiveDBIterator) dbAdaptor.iterator(
new Query()
.append(VariantDBAdaptor.VariantQueryParams.STUDIES.key(), studyConfiguration.getStudyId())
.append(VariantDBAdaptor.VariantQueryParams.FILES.key(), fileId),
new QueryOptions("archive", true));
ArchiveHelper archiveHelper = dbAdaptor.getArchiveHelper(studyConfiguration.getStudyId(), fileId);
for (Result result : archive.getResultScanner()) {
byte[] value = result.getValue(archiveHelper.getColumnFamily(), archiveHelper.getColumn());
VcfSliceProtos.VcfSlice vcfSlice = VcfSliceProtos.VcfSlice.parseFrom(value);
os.write(vcfSlice.toString().getBytes());
}
}
public static void printTables(Configuration conf) throws IOException {
System.out.println("Print tables!");
System.out.println("conf.get(HConstants.ZOOKEEPER_QUORUM) = " + conf.get(HConstants.ZOOKEEPER_QUORUM));
try (Connection con = ConnectionFactory.createConnection(conf)) {
HBaseManager.act(con, "all", (table, admin) -> {
for (NamespaceDescriptor ns : admin.listNamespaceDescriptors()) {
System.out.println(ns.getName());
for (TableName tableName : admin.listTableNamesByNamespace(ns.getName())) {
System.out.println(" " + tableName);
}
System.out.println("---");
}
return null;
});
}
}
public static void printVariants(StudyConfiguration studyConfiguration, VariantHadoopDBAdaptor dbAdaptor, URI outDir) throws Exception {
printVariants(studyConfiguration, dbAdaptor, Paths.get(outDir));
}
public static void printVariants(StudyConfiguration studyConfiguration, VariantHadoopDBAdaptor dbAdaptor, Path outDir) throws Exception {
FileStudyConfigurationManager.write(studyConfiguration, outDir.resolve("study_configuration.json"));
printVariantsFromArchiveTable(dbAdaptor, studyConfiguration, outDir);
printVariantsFromVariantsTable(dbAdaptor, outDir);
printVariantsFromDBAdaptor(dbAdaptor, outDir);
printArchiveTable(studyConfiguration, dbAdaptor, outDir);
}
public static void removeFile(HadoopVariantStorageEngine variantStorageManager, String dbName, int fileId,
StudyConfiguration studyConfiguration, Map<? extends String, ?> otherParams) throws Exception {
ObjectMap params = new ObjectMap(VariantStorageEngine.Options.STUDY_CONFIGURATION.key(), studyConfiguration)
.append(VariantStorageEngine.Options.STUDY_ID.key(), studyConfiguration.getStudyId())
.append(VariantStorageEngine.Options.DB_NAME.key(), dbName);
if (otherParams != null) {
params.putAll(otherParams);
}
variantStorageManager.getConfiguration().getStorageEngine(variantStorageManager.getStorageEngineId()).getVariant().getOptions()
.putAll(params);
variantStorageManager.dropFile(studyConfiguration.getStudyName(), fileId);
studyConfiguration.copy(
variantStorageManager
.getDBAdaptor()
.getStudyConfigurationManager()
.getStudyConfiguration(studyConfiguration.getStudyId(), null)
.first());
// return variantStorageManager.readVariantSource(etlResult.getTransformResult(), new ObjectMap());
}
public static VariantSource loadFile(HadoopVariantStorageEngine variantStorageManager, String dbName, URI outputUri,
String resourceName, int fileId, StudyConfiguration studyConfiguration, Map<? extends String, ?> otherParams, boolean doTransform, boolean loadArchive, boolean loadVariant) throws Exception {
URI fileInputUri = VariantStorageBaseTest.getResourceUri(resourceName);
ObjectMap params = new ObjectMap(VariantStorageEngine.Options.TRANSFORM_FORMAT.key(), "proto")
.append(VariantStorageEngine.Options.STUDY_CONFIGURATION.key(), studyConfiguration)
.append(VariantStorageEngine.Options.STUDY_ID.key(), studyConfiguration.getStudyId())
.append(VariantStorageEngine.Options.STUDY_NAME.key(), studyConfiguration.getStudyName())
.append(VariantStorageEngine.Options.DB_NAME.key(), dbName).append(VariantStorageEngine.Options.ANNOTATE.key(), false)
.append(VariantAnnotationManager.SPECIES, "hsapiens").append(VariantAnnotationManager.ASSEMBLY, "GRch37")
.append(VariantStorageEngine.Options.CALCULATE_STATS.key(), false)
.append(HadoopVariantStorageEngine.HADOOP_LOAD_DIRECT, true)
.append(HadoopVariantStorageEngine.HADOOP_LOAD_ARCHIVE, loadArchive)
.append(HadoopVariantStorageEngine.HADOOP_LOAD_VARIANT, loadVariant);
if (otherParams != null) {
params.putAll(otherParams);
}
if (fileId > 0) {
params.append(VariantStorageEngine.Options.FILE_ID.key(), fileId);
}
StoragePipelineResult etlResult = VariantStorageBaseTest.runETL(variantStorageManager, fileInputUri, outputUri, params, doTransform, doTransform, true);
StudyConfiguration updatedStudyConfiguration = variantStorageManager.getDBAdaptor().getStudyConfigurationManager().getStudyConfiguration(studyConfiguration.getStudyId(), null).first();
studyConfiguration.copy(updatedStudyConfiguration);
return variantStorageManager.readVariantSource(doTransform ? etlResult.getTransformResult() : etlResult.getInput());
}
public static VariantSource loadFile(HadoopVariantStorageEngine variantStorageManager, String dbName, URI outputUri,
String resourceName, int fileId, StudyConfiguration studyConfiguration) throws Exception {
return loadFile(variantStorageManager, dbName, outputUri, resourceName, fileId, studyConfiguration, null, true, true, true);
}
public static VariantSource loadFile(HadoopVariantStorageEngine variantStorageManager, String dbName, URI outputUri,
String resourceName, StudyConfiguration studyConfiguration, Map<? extends String, ?> otherParams) throws Exception {
return loadFile(variantStorageManager, dbName, outputUri, resourceName, -1, studyConfiguration, otherParams, true, true, true);
}
}