package org.opencb.opencga.storage.hadoop.variant;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryResult;
import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.hadoop.utils.HBaseManager;
import org.opencb.opencga.storage.hadoop.variant.archive.ArchiveDriver;
import org.opencb.opencga.storage.hadoop.variant.index.VariantTableHelper;
import org.opencb.opencga.storage.hadoop.variant.metadata.HBaseStudyConfigurationManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.stream.Collectors;
import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine.OPENCGA_STORAGE_HADOOP_MAPREDUCE_SCANNER_TIMEOUT;
import static org.opencb.opencga.storage.hadoop.variant.index.AbstractVariantTableDriver.CONFIG_VARIANT_FILE_IDS;
import static org.opencb.opencga.storage.hadoop.variant.index.AbstractVariantTableDriver.CONFIG_VARIANT_TABLE_NAME;
/**
 * Base driver for MapReduce jobs that scan a variant HBase table.
 *
 * <p>{@link #run(String[])} is the template: it reads the positional arguments into the
 * Hadoop {@link Configuration}, validates them, checks that the archive and variant tables
 * exist, resolves the file ids to process, builds the {@link Job} and {@link Scan}, and runs
 * the job. Subclasses supply the mapper through {@link #getMapperClass()} and may hook into
 * {@link #parseAndValidateParameters()}, {@link #preExecution(String)} and
 * {@link #postExecution(boolean)}.
 *
 * <p>Created by mh719 on 21/11/2016.
 */
public abstract class AbstractAnalysisTableDriver extends Configured implements Tool {

    /** Number of leading positional arguments; indices 1 to 4 are read by {@link #configFromArgs}. */
    private static final int FIXED_ARGS_COUNT = 5;

    // Instance logger bound to the concrete subclass. The name is kept as-is because
    // subclasses can reference this protected field directly.
    protected final Logger LOG = LoggerFactory.getLogger(this.getClass());

    // Both are created lazily, see getHelper() and getStudyConfigurationManager().
    private VariantTableHelper variantTablehelper;
    private HBaseStudyConfigurationManager scm;

    public AbstractAnalysisTableDriver() { /* nothing */ }

    public AbstractAnalysisTableDriver(Configuration conf) {
        super(conf);
    }

    /**
     * Configures, validates and executes the MapReduce job.
     *
     * <p>The study configuration manager is closed when this method exits, whether it
     * returns normally or throws.
     *
     * @param args positional arguments, see {@link #configFromArgs(String[], int)}
     * @return {@code 0} if the job succeeded, {@code 1} otherwise
     * @throws IllegalArgumentException if a required parameter is missing or invalid
     * @throws Exception on any failure while preparing or running the job
     */
    @Override
    public int run(String[] args) throws Exception {
        int fixedSizeArgs = FIXED_ARGS_COUNT;
        boolean finished = false;
        try {
            configFromArgs(args, fixedSizeArgs);
            Configuration conf = getConf();
            String archiveTable = conf.get(ArchiveDriver.CONFIG_ARCHIVE_TABLE_NAME, StringUtils.EMPTY);
            String variantTable = conf.get(CONFIG_VARIANT_TABLE_NAME, StringUtils.EMPTY);
            int studyId = conf.getInt(GenomeHelper.CONFIG_STUDY_ID, -1);

            /* -------------------------------*/
            // Validate parameters CHECK
            if (StringUtils.isEmpty(archiveTable)) {
                throw new IllegalArgumentException("No archive hbase table basename specified!!!");
            }
            if (StringUtils.isEmpty(variantTable)) {
                throw new IllegalArgumentException("No variant hbase table specified!!!");
            }
            if (archiveTable.equals(variantTable)) {
                throw new IllegalArgumentException("archive and variant tables must be different");
            }
            if (studyId < 0) {
                throw new IllegalArgumentException("No Study id specified!!!");
            }
            parseAndValidateParameters();

            getLog().info("Use table {} as input", variantTable);
            GenomeHelper.setStudyId(conf, studyId);
            VariantTableHelper.setOutputTableName(conf, variantTable);
            VariantTableHelper.setInputTableName(conf, archiveTable);
            VariantTableHelper gh = getHelper();

            /* -------------------------------*/
            // Validate input CHECK
            checkTablesExist(gh, archiveTable, variantTable);

            // Check File(s) or Study is specified
            List<Integer> fileIds = getFilesToUse();

            /* -------------------------------*/
            // JOB setup
            setConf(conf);
            Job job = createJob(variantTable, fileIds);

            // QUERY design
            Scan scan = createScan();

            // set other scan attrs
            boolean addDependencyJar = conf.getBoolean(GenomeHelper.CONFIG_HBASE_ADD_DEPENDENCY_JARS, true);
            initMapReduceJob(variantTable, job, scan, addDependencyJar);

            preExecution(variantTable);
            boolean succeed = executeJob(job);
            if (!succeed) {
                getLog().error("error with job!");
            }
            postExecution(succeed);

            finished = true;
            return succeed ? 0 : 1;
        } finally {
            // On the failure path a close error is only logged, so it cannot mask
            // the exception that is already propagating.
            closeStudyConfigurationManager(!finished);
        }
    }

    /**
     * Hook invoked right before the job is executed. Does nothing by default.
     *
     * @param variantTable name of the variant table used as job input
     * @throws IOException            declared for overriding implementations
     * @throws StorageEngineException declared for overriding implementations
     */
    protected void preExecution(String variantTable) throws IOException, StorageEngineException {
        // do nothing
    }

    /**
     * Hook invoked after the job has finished. Does nothing by default.
     *
     * @param succeed whether the job completed successfully
     * @throws IOException            declared for overriding implementations
     * @throws StorageEngineException declared for overriding implementations
     */
    protected void postExecution(boolean succeed) throws IOException, StorageEngineException {
        // do nothing
    }

    /**
     * Lets subclasses read and validate their own parameters. Called by {@link #run(String[])}
     * after the common parameters have been validated.
     */
    protected abstract void parseAndValidateParameters();

    /**
     * Runs the job and waits for it to finish. A JVM shutdown hook is registered for the
     * duration of the wait so that an incomplete job is killed if the JVM is stopped.
     *
     * @param job job to execute
     * @return {@code true} if the job succeeded
     * @throws IOException            if job submission or monitoring fails
     * @throws InterruptedException   if the wait is interrupted
     * @throws ClassNotFoundException if a job class cannot be found
     */
    protected boolean executeJob(Job job) throws IOException, InterruptedException, ClassNotFoundException {
        Thread hook = new Thread(() -> {
            try {
                if (!job.isComplete()) {
                    job.killJob();
                }
            } catch (IOException e) {
                getLog().error("Error", e);
            }
        });
        Runtime.getRuntime().addShutdownHook(hook);
        try {
            return job.waitForCompletion(true);
        } finally {
            // Always unregister, even when waitForCompletion throws; otherwise the hook
            // (and the Job it captures) stays registered until the JVM exits.
            try {
                Runtime.getRuntime().removeShutdownHook(hook);
            } catch (IllegalStateException e) {
                // The JVM is already shutting down; hooks can no longer be removed.
                getLog().debug("Shutdown in progress, shutdown hook not removed", e);
            }
        }
    }

    /**
     * @return the mapper class used by the job; also used to locate the job jar
     */
    protected abstract Class<? extends TableMapper> getMapperClass();

    /**
     * Wires the table mapper into the job. No mapper output key/value classes are set.
     *
     * @param inTable          input table name
     * @param job              job to configure
     * @param scan             scan controlling column family and attribute selection
     * @param addDependencyJar whether dependency jars are added to the job
     * @throws IOException if the job cannot be initialised
     */
    protected void initMapReduceJob(String inTable, Job job, Scan scan, boolean addDependencyJar)
            throws IOException {
        TableMapReduceUtil.initTableMapperJob(
                inTable,           // input table
                scan,              // Scan instance to control CF and attribute selection
                getMapperClass(),  // mapper class
                null,              // mapper output key
                null,              // mapper output value
                job,
                addDependencyJar);
    }

    /**
     * Builds the scan over the helper's column family with block caching disabled.
     *
     * @return the scan used as job input
     */
    protected Scan createScan() {
        Scan scan = new Scan();
        // Scan caching is intentionally left untouched here; per an earlier note in this
        // file, a value of 200 caused timeout issues.
        scan.setCacheBlocks(false); // don't set to true for MR jobs
        scan.addFamily(getHelper().getColumnFamily()); // Ignore PHOENIX columns!!!
        return scan;
    }

    /**
     * Creates the job and applies the scanner timeout. The timeout is taken from
     * {@code OPENCGA_STORAGE_HADOOP_MAPREDUCE_SCANNER_TIMEOUT}, falling back to the HBase
     * client scanner timeout and then to the HBase default.
     *
     * @param variantTable variant table name, used in the job name
     * @param files        file ids, used in the job name
     * @return the new job
     * @throws IOException if the job cannot be created
     */
    protected Job createJob(String variantTable, List<Integer> files) throws IOException {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "opencga: Export files " + files
                + " from VariantTable '" + variantTable + "'");
        job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");
        job.setJarByClass(getMapperClass());    // class that contains mapper

        int hbaseTimeout = conf.getInt(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD,
                HConstants.DEFAULT_HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD);
        int scannerTimeout = conf.getInt(OPENCGA_STORAGE_HADOOP_MAPREDUCE_SCANNER_TIMEOUT, hbaseTimeout);
        getLog().info("Set Scanner timeout to " + scannerTimeout + " ...");
        job.getConfiguration().setInt(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, scannerTimeout);
        return job;
    }

    /**
     * Resolves the file ids to process. If none are configured, all indexed files of the
     * study are used; otherwise every configured id must be an indexed file.
     *
     * @return a non-empty list of file ids
     * @throws IOException              if the study configuration cannot be loaded
     * @throws IllegalStateException    if a configured file id is not indexed
     * @throws IllegalArgumentException if no file is configured and the study has none indexed
     * @throws NumberFormatException    if a configured file id is not an integer
     */
    protected List<Integer> getFilesToUse() throws IOException {
        StudyConfiguration studyConfiguration = loadStudyConfiguration();
        LinkedHashSet<Integer> indexedFiles = studyConfiguration.getIndexedFiles();

        // NOTE(review): Hadoop's Configuration.getStrings appears to return null (not the
        // default) when the stored value is blank, e.g. for an empty file-id argument.
        String[] fileArr = getConf().getStrings(CONFIG_VARIANT_FILE_IDS, new String[0]);
        List<Integer> files = new ArrayList<>();
        if (fileArr != null) {
            files = Arrays.stream(fileArr)
                    .map(String::trim)
                    .filter(s -> !s.isEmpty())
                    .map(Integer::parseInt)
                    .collect(Collectors.toList());
        }

        if (files.isEmpty()) { // no files specified - use all indexed files for study
            files = new ArrayList<>(indexedFiles);
        } else { // Validate that they exist
            List<Integer> notIndexed = files.stream()
                    .filter(fid -> !indexedFiles.contains(fid))
                    .collect(Collectors.toList());
            if (!notIndexed.isEmpty()) {
                throw new IllegalStateException("Provided File ID(s) not indexed!!!" + notIndexed);
            }
        }
        if (files.isEmpty()) { // if still empty (no files provided and / or found in study
            throw new IllegalArgumentException("No files specified / available for study "
                    + getHelper().getStudyId());
        }
        return files;
    }

    /**
     * Loads the configuration of the study configured in the helper.
     *
     * @return the study configuration
     * @throws IOException           if the study configuration manager cannot be created
     * @throws IllegalStateException if the lookup does not return exactly one result
     */
    protected StudyConfiguration loadStudyConfiguration() throws IOException {
        HBaseStudyConfigurationManager manager = getStudyConfigurationManager();
        int studyId = getHelper().getStudyId();
        QueryResult<StudyConfiguration> res = manager.getStudyConfiguration(studyId, new QueryOptions());
        int found = res.getResult().size();
        if (found != 1) {
            throw new IllegalStateException("StudyConfiguration " + studyId + " not found! " + found);
        }
        return res.first();
    }

    /**
     * Returns the study configuration manager, creating it on first use against the
     * helper's output table.
     *
     * @return the lazily created manager
     * @throws IOException if the manager cannot be created
     */
    protected HBaseStudyConfigurationManager getStudyConfigurationManager() throws IOException {
        if (scm == null) {
            byte[] outTable = getHelper().getOutputTable();
            scm = new HBaseStudyConfigurationManager(Bytes.toString(outTable), getConf(), null);
        }
        return scm;
    }

    /**
     * Closes the lazily created study configuration manager, if any, and forgets it so a
     * later call to {@link #getStudyConfigurationManager()} creates a fresh one.
     *
     * @param suppressErrors if {@code true}, a failure to close is logged instead of thrown
     * @throws Exception if closing fails and {@code suppressErrors} is {@code false}
     */
    private void closeStudyConfigurationManager(boolean suppressErrors) throws Exception {
        HBaseStudyConfigurationManager manager = scm;
        if (manager == null) {
            return;
        }
        scm = null;
        try {
            manager.close();
        } catch (Exception e) {
            if (!suppressErrors) {
                throw e;
            }
            getLog().warn("Could not close the StudyConfigurationManager", e);
        }
    }

    /**
     * Verifies that every given table exists.
     *
     * @param genomeHelper helper providing the {@link HBaseManager}
     * @param tables       table names to check
     * @throws IllegalArgumentException if a table does not exist
     * @throws IllegalStateException    if the existence check fails with an {@link IOException}
     */
    private void checkTablesExist(GenomeHelper genomeHelper, String... tables) {
        final HBaseManager hBaseManager = genomeHelper.getHBaseManager();
        for (String table : tables) {
            boolean exists;
            try {
                exists = hBaseManager.tableExists(table);
            } catch (IOException e) {
                throw new IllegalStateException(e);
            }
            if (!exists) {
                throw new IllegalArgumentException(String.format("Table %s does not exist!!!", table));
            }
        }
    }

    /**
     * @return the lazily created helper built from the current configuration
     */
    protected VariantTableHelper getHelper() {
        if (null == variantTablehelper) {
            variantTablehelper = new VariantTableHelper(getConf());
        }
        return variantTablehelper;
    }

    /**
     * Copies the command line arguments into the configuration.
     *
     * <p>Layout: {@code args[1]} archive table, {@code args[2]} variant table,
     * {@code args[3]} study id, {@code args[4]} comma separated file ids; {@code args[0]}
     * is not read here. Arguments from index {@code fixedSizeArgs} onwards are consumed as
     * {@code key value} pairs. All checks run before the configuration is modified.
     *
     * @param args          command line arguments
     * @param fixedSizeArgs index of the first {@code key value} pair
     * @throws IllegalArgumentException if there are too few arguments, {@code fixedSizeArgs}
     *                                  is negative, or a trailing key has no value
     */
    protected void configFromArgs(String[] args, int fixedSizeArgs) {
        int argCount = args == null ? 0 : args.length;
        if (argCount < FIXED_ARGS_COUNT) {
            throw new IllegalArgumentException("Expected at least " + FIXED_ARGS_COUNT
                    + " arguments but got " + argCount);
        }
        if (fixedSizeArgs < 0) {
            throw new IllegalArgumentException("Negative number of fixed arguments: " + fixedSizeArgs);
        }
        int extraArgs = argCount - fixedSizeArgs;
        if (extraArgs > 0 && extraArgs % 2 != 0) {
            throw new IllegalArgumentException("Missing value for configuration key '"
                    + args[argCount - 1] + "': extra arguments must be key value pairs");
        }

        Configuration conf = getConf();
        conf.set(ArchiveDriver.CONFIG_ARCHIVE_TABLE_NAME, args[1]);
        conf.set(CONFIG_VARIANT_TABLE_NAME, args[2]);
        conf.set(GenomeHelper.CONFIG_STUDY_ID, args[3]);
        conf.setStrings(CONFIG_VARIANT_FILE_IDS, args[4].split(","));
        for (int i = fixedSizeArgs; i < argCount; i += 2) {
            conf.set(args[i], args[i + 1]);
        }
    }

    /**
     * @return the logger of this driver
     */
    public Logger getLog() {
        return LOG;
    }

    /**
     * Runs the given driver through {@link ToolRunner}.
     *
     * @param args   command line arguments
     * @param conf   configuration to use; if {@code null} one is created with
     *               {@link HBaseConfiguration#create()}
     * @param driver driver to execute
     * @return the exit code returned by the driver
     * @throws Exception if the driver fails
     */
    public static int privateMain(String[] args, Configuration conf, AbstractAnalysisTableDriver driver) throws Exception {
        // info https://code.google.com/p/temapred/wiki/HbaseWithJava
        if (conf == null) {
            conf = HBaseConfiguration.create();
        }
        driver.setConf(conf);
        return ToolRunner.run(driver, args);
    }
}