/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.hadoop.variant.index;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionConfiguration;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.ColumnPrefixFilter;
import org.apache.hadoop.hbase.filter.ColumnRangeFilter;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.mapreduce.MultiTableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.phoenix.util.SchemaUtil;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryResult;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.hadoop.utils.HBaseManager;
import org.opencb.opencga.storage.hadoop.variant.GenomeHelper;
import org.opencb.opencga.storage.hadoop.variant.archive.ArchiveDriver;
import org.opencb.opencga.storage.hadoop.variant.archive.ArchiveHelper;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.VariantPhoenixHelper;
import org.opencb.opencga.storage.hadoop.variant.metadata.HBaseStudyConfigurationManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine.OPENCGA_STORAGE_HADOOP_MAPREDUCE_SCANNER_TIMEOUT;

/**
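 * Base driver for MapReduce jobs that read the variant archive table and write to the
 * analysis (variant) table. Subclasses supply the mapper through {@link #getMapperClass()}
 * and the operation name through {@link #getJobOperationName()}.
 *
 * <p>Illustrative usage sketch; the subclass name {@code MyVariantTableDriver} is
 * hypothetical, and the argument layout is the one parsed by
 * {@link #configure(String[], Configured)}:
 * <pre>{@code
 * MyVariantTableDriver driver = new MyVariantTableDriver();
 * driver.setConf(HBaseConfiguration.create()); // configure() requires a non-null Configuration
 * String[] toolArgs = configure(args, driver); // <server> <input-table> <output-table> <studyId> <fileIds> [<key> <value>]*
 * if (toolArgs != null) {
 *     System.exit(ToolRunner.run(driver, toolArgs));
 * }
 * }</pre>
 *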
* @author Matthias Haimel mh719+git@cam.ac.uk
*
*/
public abstract class AbstractVariantTableDriver extends Configured implements Tool {
private static final Logger LOG = LoggerFactory.getLogger(AbstractVariantTableDriver.class);
public static final String CONFIG_VARIANT_FILE_IDS = "opencga.variant.input.file_ids";
public static final String CONFIG_VARIANT_TABLE_NAME = "opencga.variant.table.name";
public static final String CONFIG_VARIANT_TABLE_COMPRESSION = "opencga.variant.table.compression";
public static final String CONFIG_VARIANT_TABLE_PRESPLIT_SIZE = "opencga.variant.table.presplit.size";
public static final String TIMESTAMP = "opencga.variant.table.timestamp";
public static final String HBASE_KEYVALUE_SIZE_MAX = "hadoop.load.variant.hbase.client.keyvalue.maxsize";
public static final String HBASE_SCAN_CACHING = "hadoop.load.variant.scan.caching";
    private VariantTableHelper variantTableHelper;
protected HBaseStudyConfigurationManager scm;
protected StudyConfiguration studyConfiguration;
public AbstractVariantTableDriver() { /* nothing */ }
/**
* @param conf Configuration.
*/
public AbstractVariantTableDriver(Configuration conf) {
super(conf);
}
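    /**
     * @return the {@link TableMapper} run against the archive table; also used by
     *         {@link Job#setJarByClass} to locate the job jar
     */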
    @SuppressWarnings("rawtypes")
protected abstract Class<? extends TableMapper> getMapperClass();
@Override
public int run(String[] args) throws Exception {
Configuration conf = getConf();
int maxKeyValueSize = conf.getInt(HBASE_KEYVALUE_SIZE_MAX, 10485760); // 10MB
getLog().info("HBASE: set " + ConnectionConfiguration.MAX_KEYVALUE_SIZE_KEY + " to " + maxKeyValueSize);
conf.setInt(ConnectionConfiguration.MAX_KEYVALUE_SIZE_KEY, maxKeyValueSize); // always overwrite server default (usually 1MB)
String inTable = conf.get(ArchiveDriver.CONFIG_ARCHIVE_TABLE_NAME, StringUtils.EMPTY);
String outTable = conf.get(CONFIG_VARIANT_TABLE_NAME, StringUtils.EMPTY);
String[] fileArr = argFileArray();
        int studyId = conf.getInt(GenomeHelper.CONFIG_STUDY_ID, -1);
        /* -------------------------------*/
        // Validate parameters
        if (StringUtils.isEmpty(inTable)) {
            throw new IllegalArgumentException("No input HBase table name specified!");
        }
        if (StringUtils.isEmpty(outTable)) {
            throw new IllegalArgumentException("No output HBase table specified!");
        }
        if (inTable.equals(outTable)) {
            throw new IllegalArgumentException("Input and output tables must be different");
        }
        if (studyId < 0) {
            throw new IllegalArgumentException("No study ID specified!");
        }
        int fileCnt = fileArr.length;
        if (fileCnt == 0) {
            throw new IllegalArgumentException("No file IDs specified");
        }
        // Parse early to fail fast on non-numeric file IDs
        List<Integer> fileIds = new ArrayList<>(fileArr.length);
        for (String fileIdStr : fileArr) {
            int id = Integer.parseInt(fileIdStr);
            fileIds.add(id);
        }
getLog().info(String.format("Use table %s as input", inTable));
GenomeHelper.setStudyId(conf, studyId);
VariantTableHelper.setOutputTableName(conf, outTable);
VariantTableHelper.setInputTableName(conf, inTable);
VariantTableHelper gh = getHelper();
/* -------------------------------*/
        // Validate input
HBaseManager hBaseManager = gh.getHBaseManager();
if (!hBaseManager.tableExists(inTable)) {
throw new IllegalArgumentException(String.format("Input table %s does not exist!!!", inTable));
}
/* -------------------------------*/
// JOB setup
setConf(conf);
Job job = createJob(outTable, fileArr);
// QUERY design
Scan scan = createScan(gh, fileArr);
// set other scan attrs
boolean addDependencyJar = conf.getBoolean(GenomeHelper.CONFIG_HBASE_ADD_DEPENDENCY_JARS, true);
initMapReduceJob(inTable, outTable, job, scan, addDependencyJar);
boolean succeed = executeJob(job);
if (!succeed) {
            getLog().error("Job failed!");
}
getStudyConfigurationManager().close();
return succeed ? 0 : 1;
}
    /**
     * Gives the name of the action that the job performs.
     *
     * Used to create the job name and as
     * {@link org.opencb.opencga.storage.core.metadata.BatchFileOperation#operationName},
     * e.g. "Delete", "Load", "Annotate", ...
     *
     * @return Job action
     */
    protected abstract String getJobOperationName();
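    /**
     * @return file IDs read from {@link #CONFIG_VARIANT_FILE_IDS}, or an empty array if unset
     */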
protected String[] argFileArray() {
return getConf().getStrings(CONFIG_VARIANT_FILE_IDS, new String[0]);
}
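    /**
     * @return lazily created {@link VariantTableHelper} bound to the current configuration
     */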
protected VariantTableHelper getHelper() {
        if (null == variantTableHelper) {
            variantTableHelper = new VariantTableHelper(getConf());
        }
        return variantTableHelper;
}
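    /**
     * Wires the table mapper and the output table. The job is map-only: the reducer
     * initialization only sets up the output table and credentials, and
     * {@link MultiTableOutputFormat} allows mappers to write to more than one table.
     *
     * @param inTable          input (archive) table
     * @param outTable         output (variant) table
     * @param job              job to configure
     * @param scan             scan controlling column family and attribute selection
     * @param addDependencyJar whether to ship HBase dependency jars with the job
     * @throws IOException on configuration errors
     */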
protected void initMapReduceJob(String inTable, String outTable, Job job, Scan scan, boolean addDependencyJar) throws IOException {
TableMapReduceUtil.initTableMapperJob(
inTable, // input table
scan, // Scan instance to control CF and attribute selection
getMapperClass(), // mapper class
null, // mapper output key
null, // mapper output value
job,
addDependencyJar);
TableMapReduceUtil.initTableReducerJob(
outTable, // output table
null, // reducer class
job,
null, null, null, null,
addDependencyJar);
        job.setNumReduceTasks(0); // map-only: the reducer init above only wires the output table and credentials
        job.setOutputFormatClass(MultiTableOutputFormat.class);
}
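    /**
     * Submits the job and waits for completion; a shutdown hook kills the job
     * if the driver JVM terminates before the job completes.
     *
     * @param job job to execute
     * @return true if the job succeeded
     * @throws IOException            on submission errors
     * @throws InterruptedException   if interrupted while waiting
     * @throws ClassNotFoundException if job classes cannot be loaded
     */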
protected boolean executeJob(Job job) throws IOException, InterruptedException, ClassNotFoundException {
        Thread hook = new Thread(() -> {
            try {
                // Kill the MR job if the driver JVM shuts down before completion
                if (!job.isComplete()) {
                    job.killJob();
                }
            } catch (IOException e) {
                LOG.error("Error killing job on shutdown", e);
            }
        });
Runtime.getRuntime().addShutdownHook(hook);
boolean succeed = job.waitForCompletion(true);
Runtime.getRuntime().removeShutdownHook(hook);
return succeed;
}
private Logger getLog() {
return LOG;
}
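    /**
     * Creates the job and raises the HBase scanner timeout to avoid
     * ScannerTimeoutExceptions on slow mappers (see opencb/opencga#352).
     *
     * @param outTable output variant table name, used in the job name
     * @param fileArr  file IDs being processed, used in the job name
     * @return configured job
     * @throws IOException on job creation errors
     */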
protected Job createJob(String outTable, String[] fileArr) throws IOException {
Job job = Job.getInstance(getConf(), "opencga: " + getJobOperationName() + " file " + Arrays.toString(fileArr)
+ " on VariantTable '" + outTable + "'");
job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");
job.setJarByClass(getMapperClass()); // class that contains mapper
// Increase the ScannerTimeoutPeriod to avoid ScannerTimeoutExceptions
// See opencb/opencga#352 for more info.
int scannerTimeout = getConf().getInt(OPENCGA_STORAGE_HADOOP_MAPREDUCE_SCANNER_TIMEOUT,
getConf().getInt(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, HConstants.DEFAULT_HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD));
getLog().info("Set Scanner timeout to " + scannerTimeout + " ...");
job.getConfiguration().setInt(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, scannerTimeout);
return job;
}
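    /**
     * Builds the archive-table scan: caching from {@link #HBASE_SCAN_CACHING}, block
     * cache disabled for the MR scan, and a {@code MUST_PASS_ONE} filter list returning
     * the requested file columns plus the variant columns.
     *
     * @param gh      table helper (available to subclasses; unused here)
     * @param fileArr file IDs whose columns are returned
     * @return configured scan
     */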
protected Scan createScan(VariantTableHelper gh, String[] fileArr) {
Scan scan = new Scan();
int caching = getConf().getInt(HBASE_SCAN_CACHING, 50);
getLog().info("Scan set Caching to " + caching);
scan.setCaching(caching); // 1 is the default in Scan, 200 caused timeout issues.
scan.setCacheBlocks(false); // don't set to true for MR jobs
// https://hbase.apache.org/book.html#perf.hbase.client.seek
int lookAhead = getConf().getInt("hadoop.load.variant.scan.lookahead", -1);
if (lookAhead > 0) {
getLog().info("Scan set LOOKAHEAD to " + lookAhead);
scan.setAttribute(Scan.HINT_LOOKAHEAD, Bytes.toBytes(lookAhead));
}
        // Return only the requested file columns plus the variant columns
        FilterList filter = new FilterList(FilterList.Operator.MUST_PASS_ONE);
        for (String fileIdStr : fileArr) {
            int id = Integer.parseInt(fileIdStr);
            // Inclusive range with identical bounds: matches exactly this file's column
            byte[] column = Bytes.toBytes(ArchiveHelper.getColumnName(id));
            filter.addFilter(new ColumnRangeFilter(column, true, column, true));
        }
        filter.addFilter(new ColumnPrefixFilter(GenomeHelper.VARIANT_COLUMN_B_PREFIX));
scan.setFilter(filter);
return scan;
}
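    /**
     * Loads the {@link StudyConfiguration} for the study ID held in the configuration.
     *
     * @return the study configuration
     * @throws IOException           on manager errors
     * @throws IllegalStateException if exactly one result is not found
     */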
protected StudyConfiguration loadStudyConfiguration() throws IOException {
HBaseStudyConfigurationManager scm = getStudyConfigurationManager();
int studyId = getHelper().getStudyId();
QueryResult<StudyConfiguration> res = scm.getStudyConfiguration(studyId, new QueryOptions());
        if (res.getResult().size() != 1) {
            throw new IllegalStateException("Expected one StudyConfiguration for study " + studyId
                    + ", found " + res.getResult().size());
}
return res.first();
}
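    /**
     * @return lazily created {@link HBaseStudyConfigurationManager} backed by the output (variant) table
     * @throws IOException on connection errors
     */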
protected HBaseStudyConfigurationManager getStudyConfigurationManager() throws IOException {
if (scm == null) {
byte[] outTable = getHelper().getOutputTable();
scm = new HBaseStudyConfigurationManager(Bytes.toString(outTable), getConf(), null);
}
return scm;
}
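    /**
     * Builds the command line understood by {@link #configure(String[], Configured)}.
     * For example (hypothetical values), {@code buildCommandLineArgs("zk-host", "archive",
     * "variants", 1, Arrays.asList(5, 6), other)} yields a string starting with
     * {@code "zk-host archive variants 1 5,6"}.
     *
     * @param server      HBase server, as consumed by {@link HBaseManager#addHBaseSettings}
     * @param inputTable  archive table name
     * @param outputTable variant table name
     * @param studyId     numeric study ID
     * @param fileIds     file IDs, joined with commas
     * @param other       extra key/value parameters appended via {@link ArchiveDriver#addOtherParams}
     * @return the assembled argument string
     */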
public static String buildCommandLineArgs(String server, String inputTable, String outputTable, int studyId,
List<Integer> fileIds, Map<String, Object> other) {
StringBuilder stringBuilder = new StringBuilder().append(server).append(' ').append(inputTable).append(' ')
.append(outputTable).append(' ').append(studyId).append(' ');
stringBuilder.append(fileIds.stream().map(Object::toString).collect(Collectors.joining(",")));
ArchiveDriver.addOtherParams(other, stringBuilder);
return stringBuilder.toString();
}
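    /**
     * Convenience overload that opens and closes its own HBase connection.
     *
     * @param genomeHelper helper carrying the configuration
     * @param tableName    variant table name
     * @return true if the table was newly created
     * @throws IOException on connection or creation errors
     */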
public static boolean createVariantTableIfNeeded(GenomeHelper genomeHelper, String tableName) throws IOException {
try (Connection con = ConnectionFactory.createConnection(genomeHelper.getConf())) {
return createVariantTableIfNeeded(genomeHelper, tableName, con);
}
}
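    /**
     * Creates the variant table if missing: for namespace-qualified names the Phoenix
     * schema is created first, the HBase table is pre-split via
     * {@link GenomeHelper#generateBootPreSplitsHuman}, and on first creation the
     * Phoenix table is registered as well.
     *
     * @param genomeHelper helper carrying the configuration
     * @param tableName    variant table name, optionally namespace-qualified
     * @param con          open HBase connection (not closed by this method)
     * @return true if the table was newly created
     * @throws IOException on creation errors, wrapping Phoenix {@link SQLException}s
     */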
public static boolean createVariantTableIfNeeded(GenomeHelper genomeHelper, String tableName, Connection con)
throws IOException {
VariantPhoenixHelper variantPhoenixHelper = new VariantPhoenixHelper(genomeHelper);
String namespace = SchemaUtil.getSchemaNameFromFullName(tableName);
if (StringUtils.isNotEmpty(namespace)) {
// HBaseManager.createNamespaceIfNeeded(con, namespace);
try (java.sql.Connection jdbcConnection = variantPhoenixHelper.newJdbcConnection()) {
variantPhoenixHelper.createSchemaIfNeeded(jdbcConnection, namespace);
LOG.info("Phoenix connection is autoclosed ... " + jdbcConnection);
} catch (ClassNotFoundException | SQLException e) {
throw new IOException(e);
}
}
int nsplits = genomeHelper.getConf().getInt(CONFIG_VARIANT_TABLE_PRESPLIT_SIZE, 100);
List<byte[]> splitList = GenomeHelper.generateBootPreSplitsHuman(
nsplits,
(chr, pos) -> genomeHelper.generateVariantRowKey(chr, pos, "", ""));
boolean newTable = HBaseManager.createTableIfNeeded(con, tableName, genomeHelper.getColumnFamily(),
splitList, Compression.getCompressionAlgorithmByName(
genomeHelper.getConf().get(
CONFIG_VARIANT_TABLE_COMPRESSION,
Compression.Algorithm.SNAPPY.getName())));
if (newTable) {
try (java.sql.Connection jdbcConnection = variantPhoenixHelper.newJdbcConnection()) {
variantPhoenixHelper.createTableIfNeeded(jdbcConnection, tableName);
LOG.info("Phoenix connection is autoclosed ... " + jdbcConnection);
} catch (ClassNotFoundException | SQLException e) {
throw new IOException(e);
}
}
return newTable;
}
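    /**
     * Parses generic Hadoop options plus the fixed argument layout
     * {@code <server> <input-table> <output-table> <studyId> <fileIds> [<key> <value>]*}
     * into the configuration of the given {@link Configured}.
     *
     * @param args       raw command line arguments
     * @param configured tool whose configuration is updated in place
     * @return the remaining tool arguments, or null if the usage check failed
     * @throws Exception on parsing errors
     */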
public static String[] configure(String[] args, Configured configured) throws Exception {
// info https://code.google.com/p/temapred/wiki/HbaseWithJava
Configuration conf = configured.getConf();
if (conf == null) {
            throw new NullPointerException("Provided Configuration is null!");
}
GenericOptionsParser parser = new GenericOptionsParser(conf, args);
        // Get the tool args, excluding the generic Hadoop options
String[] toolArgs = parser.getRemainingArgs();
int fixedSizeArgs = 5;
if (toolArgs.length < fixedSizeArgs || (toolArgs.length - fixedSizeArgs) % 2 != 0) {
System.err.printf("Usage: %s [generic options] <server> <input-table> <output-table> <studyId> <fileIds>"
+ " [<key> <value>]*\n",
AbstractVariantTableDriver.class.getSimpleName());
System.err.println("Found " + Arrays.toString(toolArgs));
ToolRunner.printGenericCommandUsage(System.err);
return null;
}
conf = HBaseManager.addHBaseSettings(conf, toolArgs[0]);
conf.set(ArchiveDriver.CONFIG_ARCHIVE_TABLE_NAME, toolArgs[1]);
conf.set(CONFIG_VARIANT_TABLE_NAME, toolArgs[2]);
conf.set(GenomeHelper.CONFIG_STUDY_ID, toolArgs[3]);
conf.setStrings(CONFIG_VARIANT_FILE_IDS, toolArgs[4].split(","));
for (int i = fixedSizeArgs; i < toolArgs.length; i = i + 2) {
conf.set(toolArgs[i], toolArgs[i + 1]);
}
configured.setConf(conf);
return toolArgs;
}
}