/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.hadoop.variant;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.opencb.biodata.formats.io.FileFormatException;
import org.opencb.biodata.formats.variant.io.VariantReader;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantNormalizer;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.models.variant.protobuf.VcfMeta;
import org.opencb.biodata.models.variant.protobuf.VcfSliceProtos;
import org.opencb.biodata.tools.variant.VariantFileUtils;
import org.opencb.biodata.tools.variant.VariantVcfHtsjdkReader;
import org.opencb.biodata.tools.variant.converters.proto.VariantToVcfSliceConverter;
import org.opencb.biodata.tools.variant.stats.VariantGlobalStatsCalculator;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.io.DataWriter;
import org.opencb.commons.run.ParallelTaskRunner;
import org.opencb.commons.utils.FileUtils;
import org.opencb.hpg.bigdata.core.io.ProtoFileWriter;
import org.opencb.opencga.core.common.ProgressLogger;
import org.opencb.opencga.storage.core.config.StorageConfiguration;
import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
import org.opencb.opencga.storage.core.metadata.BatchFileOperation;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.io.plain.StringDataWriter;
import org.opencb.opencga.storage.core.variant.VariantStoragePipeline;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
import org.opencb.opencga.storage.core.variant.annotation.VariantAnnotationManager;
import org.opencb.opencga.storage.core.variant.io.VariantReaderUtils;
import org.opencb.opencga.storage.core.variant.io.json.mixin.GenericRecordAvroJsonMixin;
import org.opencb.opencga.storage.core.variant.io.json.mixin.VariantSourceJsonMixin;
import org.opencb.opencga.storage.hadoop.auth.HBaseCredentials;
import org.opencb.opencga.storage.hadoop.exceptions.StorageHadoopException;
import org.opencb.opencga.storage.hadoop.variant.adaptors.HadoopVariantSourceDBAdaptor;
import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor;
import org.opencb.opencga.storage.hadoop.variant.archive.ArchiveDriver;
import org.opencb.opencga.storage.hadoop.variant.archive.ArchiveHelper;
import org.opencb.opencga.storage.hadoop.variant.archive.VariantHbaseTransformTask;
import org.opencb.opencga.storage.hadoop.variant.executors.MRExecutor;
import org.opencb.opencga.storage.hadoop.variant.index.AbstractVariantTableDriver;
import org.opencb.opencga.storage.hadoop.variant.index.VariantTableDriver;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.PhoenixHelper;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.VariantPhoenixHelper;
import org.opencb.opencga.storage.hadoop.variant.transform.VariantSliceReader;
import org.slf4j.Logger;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.function.BiConsumer;
import java.util.function.Supplier;
import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine.*;
/**
* Created by mh719 on 13/05/2016.
*/
public abstract class AbstractHadoopVariantStoragePipeline extends VariantStoragePipeline {
protected final VariantHadoopDBAdaptor dbAdaptor;
protected final Configuration conf;
protected final HBaseCredentials archiveTableCredentials;
protected final HBaseCredentials variantsTableCredentials;
protected MRExecutor mrExecutor = null;
// Do not create Phoenix indexes. For testing purposes only.
public static final String SKIP_CREATE_PHOENIX_INDEXES = "skip.create.phoenix.indexes";
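// Illustrative usage (testing only): options.put(SKIP_CREATE_PHOENIX_INDEXES, true)
// makes preMerge skip creation of the population frequency and variant indexes.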
public AbstractHadoopVariantStoragePipeline(
StorageConfiguration configuration, String storageEngineId, Logger logger,
VariantHadoopDBAdaptor dbAdaptor,
VariantReaderUtils variantReaderUtils, ObjectMap options,
HBaseCredentials archiveCredentials, MRExecutor mrExecutor,
Configuration conf) {
super(configuration, storageEngineId, logger, dbAdaptor, variantReaderUtils, options);
this.archiveTableCredentials = archiveCredentials;
this.mrExecutor = mrExecutor;
this.dbAdaptor = dbAdaptor;
this.variantsTableCredentials = dbAdaptor == null ? null : dbAdaptor.getCredentials();
this.conf = new Configuration(conf);
}
@Override
public URI preTransform(URI input) throws StorageEngineException, IOException, FileFormatException {
logger.info("PreTransform: " + input);
// ObjectMap options = configuration.getStorageEngine(STORAGE_ENGINE_ID).getVariant().getOptions();
if (!options.containsKey(VariantStorageEngine.Options.TRANSFORM_FORMAT.key())) {
options.put(VariantStorageEngine.Options.TRANSFORM_FORMAT.key(),
VariantStorageEngine.Options.TRANSFORM_FORMAT.defaultValue());
}
String transVal = options.getString(VariantStorageEngine.Options.TRANSFORM_FORMAT.key());
switch (transVal) {
case "avro":
case "proto":
break;
default:
throw new NotImplementedException(String.format("Output format %s not supported for Hadoop!", transVal));
}
if (!options.containsKey(VariantStorageEngine.Options.GVCF.key())) {
options.put(VariantStorageEngine.Options.GVCF.key(), true);
}
boolean isGvcf = options.getBoolean(VariantStorageEngine.Options.GVCF.key());
if (!isGvcf) {
throw new NotImplementedException("Only GVCF format supported!!!");
}
return super.preTransform(input);
}
@Override
protected Pair<Long, Long> processProto(Path input, String fileName, Path output, VariantSource source, Path outputVariantsFile,
Path outputMetaFile, boolean includeSrc, String parser, boolean generateReferenceBlocks,
int batchSize, String extension, String compression,
BiConsumer<String, RuntimeException> malformatedHandler, boolean failOnError)
throws StorageEngineException {
//Writer
DataWriter<VcfSliceProtos.VcfSlice> dataWriter = new ProtoFileWriter<>(outputVariantsFile, compression);
// Normalizer
VariantNormalizer normalizer = new VariantNormalizer();
normalizer.setGenerateReferenceBlocks(generateReferenceBlocks);
// Stats calculator
VariantGlobalStatsCalculator statsCalculator = new VariantGlobalStatsCalculator(source);
VariantReader dataReader = null;
try {
if (VariantReaderUtils.isVcf(input.toString())) {
InputStream inputStream = FileUtils.newInputStream(input);
VariantVcfHtsjdkReader reader = new VariantVcfHtsjdkReader(inputStream, source, normalizer);
if (malformatedHandler != null) {
reader.registerMalformatedVcfHandler(malformatedHandler);
reader.setFailOnError(failOnError);
}
dataReader = reader;
} else {
dataReader = VariantReaderUtils.getVariantReader(input, source);
}
} catch (IOException e) {
throw new StorageEngineException("Unable to read from " + input, e);
}
// Transformer
VcfMeta meta = new VcfMeta(source);
ArchiveHelper helper = new ArchiveHelper(conf, meta);
ProgressLogger progressLogger = new ProgressLogger("Transform proto:").setBatchSize(100000);
logger.info("Generating output file {}", outputVariantsFile);
long start = System.currentTimeMillis();
long end;
// FIXME
if (options.getBoolean("transform.proto.parallel")) {
VariantSliceReader sliceReader = new VariantSliceReader(helper.getChunkSize(), dataReader);
// Use a supplier so each parallel task gets its own VariantToVcfSliceConverter; the converter is not thread-safe.
Supplier<ParallelTaskRunner.TaskWithException<ImmutablePair<Long, List<Variant>>, VcfSliceProtos.VcfSlice, ?>> supplier =
() -> {
VariantToVcfSliceConverter converter = new VariantToVcfSliceConverter();
return batch -> {
List<VcfSliceProtos.VcfSlice> slices = new ArrayList<>(batch.size());
for (ImmutablePair<Long, List<Variant>> pair : batch) {
slices.add(converter.convert(pair.getRight(), pair.getLeft().intValue()));
progressLogger.increment(pair.getRight().size());
}
return slices;
};
};
ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder()
.setNumTasks(options.getInt(Options.TRANSFORM_THREADS.key(), 1))
.setBatchSize(1)
.setAbortOnFail(true)
.setSorted(false)
.setCapacity(1)
.build();
ParallelTaskRunner<ImmutablePair<Long, List<Variant>>, VcfSliceProtos.VcfSlice> ptr;
ptr = new ParallelTaskRunner<>(sliceReader, supplier, dataWriter, config);
try {
ptr.run();
} catch (ExecutionException e) {
throw new StorageEngineException(String.format("Error while transforming file %s into %s", input, outputVariantsFile), e);
}
end = System.currentTimeMillis();
} else {
VariantHbaseTransformTask transformTask = new VariantHbaseTransformTask(helper, null);
long[] t = new long[]{0, 0, 0};
long last = System.nanoTime();
try {
dataReader.open();
dataReader.pre();
dataWriter.open();
dataWriter.pre();
transformTask.pre();
statsCalculator.pre();
start = System.currentTimeMillis();
last = System.nanoTime();
// Process data
List<Variant> read = dataReader.read(batchSize);
t[0] += System.nanoTime() - last;
last = System.nanoTime();
while (!read.isEmpty()) {
progressLogger.increment(read.size());
statsCalculator.apply(read);
List<VcfSliceProtos.VcfSlice> slices = transformTask.apply(read);
t[1] += System.nanoTime() - last;
last = System.nanoTime();
dataWriter.write(slices);
t[2] += System.nanoTime() - last;
last = System.nanoTime();
read = dataReader.read(batchSize);
t[0] += System.nanoTime() - last;
last = System.nanoTime();
}
List<VcfSliceProtos.VcfSlice> drain = transformTask.drain();
t[1] += System.nanoTime() - last;
last = System.nanoTime();
dataWriter.write(drain);
t[2] += System.nanoTime() - last;
source.getMetadata().put(VariantFileUtils.VARIANT_FILE_HEADER, dataReader.getHeader());
statsCalculator.post();
transformTask.post();
dataReader.post();
dataWriter.post();
end = System.currentTimeMillis();
logger.info("Times for reading: {}, transforming {}, writing {}",
TimeUnit.NANOSECONDS.toSeconds(t[0]),
TimeUnit.NANOSECONDS.toSeconds(t[1]),
TimeUnit.NANOSECONDS.toSeconds(t[2]));
} catch (Exception e) {
throw new StorageEngineException(String.format("Error while transforming file %s into %s", input, outputVariantsFile), e);
} finally {
dataWriter.close();
dataReader.close();
}
}
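// Serialize the VariantSource as JSON and write it to the transformed metadata file.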
ObjectMapper jsonObjectMapper = new ObjectMapper();
jsonObjectMapper.addMixIn(VariantSource.class, VariantSourceJsonMixin.class);
jsonObjectMapper.addMixIn(GenericRecord.class, GenericRecordAvroJsonMixin.class);
ObjectWriter variantSourceObjectWriter = jsonObjectMapper.writerFor(VariantSource.class);
try {
String sourceJsonString = variantSourceObjectWriter.writeValueAsString(source);
StringDataWriter.write(outputMetaFile, Collections.singletonList(sourceJsonString));
} catch (IOException e) {
throw new StorageEngineException("Error writing meta file", e);
}
return new ImmutablePair<>(start, end);
}
@Override
public URI preLoad(URI input, URI output) throws StorageEngineException {
boolean loadArch = options.getBoolean(HADOOP_LOAD_ARCHIVE);
boolean loadVar = options.getBoolean(HADOOP_LOAD_VARIANT);
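// If neither stage was requested explicitly, enable both archive and variant loading.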
if (!loadArch && !loadVar) {
loadArch = true;
loadVar = true;
options.put(HADOOP_LOAD_ARCHIVE, loadArch);
options.put(HADOOP_LOAD_VARIANT, loadVar);
}
if (loadArch) {
super.preLoad(input, output);
if (needLoadFromHdfs() && !"hdfs".equals(input.getScheme())) {
if (!StringUtils.isEmpty(options.getString(OPENCGA_STORAGE_HADOOP_INTERMEDIATE_HDFS_DIRECTORY))) {
output = URI.create(options.getString(OPENCGA_STORAGE_HADOOP_INTERMEDIATE_HDFS_DIRECTORY));
}
if (output.getScheme() != null && !output.getScheme().equals("hdfs")) {
throw new StorageEngineException("Output must be in HDFS");
}
try {
long startTime = System.currentTimeMillis();
// Configuration conf = getHadoopConfiguration(options);
FileSystem fs = FileSystem.get(conf);
org.apache.hadoop.fs.Path variantsOutputPath = new org.apache.hadoop.fs.Path(
output.resolve(Paths.get(input.getPath()).getFileName().toString()));
logger.info("Copy from {} to {}", new org.apache.hadoop.fs.Path(input).toUri(), variantsOutputPath.toUri());
fs.copyFromLocalFile(false, new org.apache.hadoop.fs.Path(input), variantsOutputPath);
logger.info("Copied to hdfs in {}s", (System.currentTimeMillis() - startTime) / 1000.0);
startTime = System.currentTimeMillis();
URI fileInput = URI.create(VariantReaderUtils.getMetaFromTransformedFile(input.toString()));
org.apache.hadoop.fs.Path fileOutputPath = new org.apache.hadoop.fs.Path(
output.resolve(Paths.get(fileInput.getPath()).getFileName().toString()));
logger.info("Copy from {} to {}", new org.apache.hadoop.fs.Path(fileInput).toUri(), fileOutputPath.toUri());
fs.copyFromLocalFile(false, new org.apache.hadoop.fs.Path(fileInput), fileOutputPath);
logger.info("Copied to hdfs in {}s", (System.currentTimeMillis() - startTime) / 1000.0);
input = variantsOutputPath.toUri();
} catch (IOException e) {
throw new StorageEngineException("Unable to copy transformed files to HDFS", e);
}
}
}
try {
ArchiveDriver.createArchiveTableIfNeeded(dbAdaptor.getGenomeHelper(), archiveTableCredentials.getTable(),
dbAdaptor.getConnection());
} catch (IOException e) {
throw new StorageHadoopException("Issue creating table " + archiveTableCredentials.getTable(), e);
}
try {
VariantTableDriver.createVariantTableIfNeeded(dbAdaptor.getGenomeHelper(), variantsTableCredentials.getTable(),
dbAdaptor.getConnection());
} catch (IOException e) {
throw new StorageHadoopException("Issue creating table " + variantsTableCredentials.getTable(), e);
}
if (loadVar) {
preMerge(input);
}
return input;
}
protected void preMerge(URI input) throws StorageEngineException {
int studyId = getStudyId();
VariantPhoenixHelper phoenixHelper = new VariantPhoenixHelper(dbAdaptor.getGenomeHelper());
try {
Connection jdbcConnection = dbAdaptor.getJdbcConnection();
String tableName = variantsTableCredentials.getTable();
phoenixHelper.registerNewStudy(jdbcConnection, tableName, studyId);
if (!options.getBoolean(SKIP_CREATE_PHOENIX_INDEXES, false)) {
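// For human studies, pre-create the population frequency columns and their Phoenix indexes.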
if (options.getString(VariantAnnotationManager.SPECIES, "hsapiens").equalsIgnoreCase("hsapiens")) {
List<PhoenixHelper.Column> columns = VariantPhoenixHelper.getHumanPopulationFrequenciesColumns();
phoenixHelper.getPhoenixHelper().addMissingColumns(jdbcConnection, tableName, columns, true);
List<PhoenixHelper.Index> popFreqIndices = VariantPhoenixHelper.getPopFreqIndices(tableName);
phoenixHelper.getPhoenixHelper().createIndexes(jdbcConnection, tableName, popFreqIndices, false);
}
phoenixHelper.createVariantIndexes(jdbcConnection, tableName);
} else {
logger.info("Skip create indexes!!");
}
} catch (SQLException e) {
throw new StorageEngineException("Unable to register study in Phoenix", e);
}
long lock = dbAdaptor.getStudyConfigurationManager().lockStudy(studyId);
// Get the StudyConfiguration. If none exists, create an empty one.
try {
StudyConfiguration studyConfiguration = checkOrCreateStudyConfiguration(true);
VariantSource source = readVariantSource(input, options);
securePreMerge(studyConfiguration, source);
dbAdaptor.getStudyConfigurationManager().updateStudyConfiguration(studyConfiguration, null);
} finally {
dbAdaptor.getStudyConfigurationManager().unLockStudy(studyId, lock);
}
}
protected void securePreMerge(StudyConfiguration studyConfiguration, VariantSource source) throws StorageEngineException {
boolean loadArch = options.getBoolean(HADOOP_LOAD_ARCHIVE);
boolean loadVar = options.getBoolean(HADOOP_LOAD_VARIANT);
if (loadVar) {
// Load into variant table
// Update the studyConfiguration with data from the Archive Table.
// Reads the VcfMeta documents, and populates the StudyConfiguration if needed.
// Obtain the list of pending files.
int studyId = options.getInt(VariantStorageEngine.Options.STUDY_ID.key(), -1);
int fileId = options.getInt(VariantStorageEngine.Options.FILE_ID.key(), -1);
boolean missingFilesDetected = false;
HadoopVariantSourceDBAdaptor fileMetadataManager = dbAdaptor.getVariantSourceDBAdaptor();
Set<Integer> files = null;
try {
files = fileMetadataManager.getLoadedFiles(studyId);
} catch (IOException e) {
throw new StorageHadoopException("Unable to read loaded files", e);
}
logger.info("Found files in Archive DB: " + files);
// Pending files, not in analysis but in archive.
List<Integer> pendingFiles = new LinkedList<>();
logger.info("Found registered indexed files: {}", studyConfiguration.getIndexedFiles());
for (Integer loadedFileId : files) {
VariantSource readSource;
try {
readSource = fileMetadataManager.getVariantSource(studyId, loadedFileId, null);
} catch (IOException e) {
throw new StorageHadoopException("Unable to read file VcfMeta for file : " + loadedFileId, e);
}
Integer readFileId = Integer.parseInt(readSource.getFileId());
logger.debug("Found source for file id {} with registered id {} ", loadedFileId, readFileId);
if (!studyConfiguration.getFileIds().inverse().containsKey(readFileId)) {
checkNewFile(studyConfiguration, readFileId, readSource.getFileName());
studyConfiguration.getFileIds().put(readSource.getFileName(), readFileId);
// studyConfiguration.getHeaders().put(readFileId, readSource.getMetadata()
// .get(VariantFileUtils.VARIANT_FILE_HEADER).toString());
checkAndUpdateStudyConfiguration(studyConfiguration, readFileId, readSource, options);
missingFilesDetected = true;
}
if (!studyConfiguration.getIndexedFiles().contains(readFileId)) {
pendingFiles.add(readFileId);
}
}
logger.info("Found pending in DB: " + pendingFiles);
fileId = checkNewFile(studyConfiguration, fileId, source.getFileName());
if (!loadArch) {
// If archive loading is skipped, the input fileId must already be in the archive table, i.e. "pending to be loaded".
if (!pendingFiles.contains(fileId)) {
throw new StorageEngineException("File " + fileId + " is not loaded in archive table "
+ getArchiveTableName(studyId, options) + "");
}
} else {
// If archive loading is not skipped, the input fileId must not be pending, because it must not be in the archive table yet.
if (pendingFiles.contains(fileId)) {
// set loadArch to false?
throw new StorageEngineException("File " + fileId + " is not loaded in archive table");
} else {
pendingFiles.add(fileId);
}
}
// If a list of pending files was given, load only those files instead of all pending files.
List<Integer> givenPendingFiles = options.getAsIntegerList(HADOOP_LOAD_VARIANT_PENDING_FILES);
if (!givenPendingFiles.isEmpty()) {
logger.info("Given Pending file list: " + givenPendingFiles);
for (Integer pendingFile : givenPendingFiles) {
if (!pendingFiles.contains(pendingFile)) {
throw new StorageEngineException("File " + pendingFile + " is not pending to be loaded in variant table");
}
}
pendingFiles = givenPendingFiles;
} else {
options.put(HADOOP_LOAD_VARIANT_PENDING_FILES, pendingFiles);
}
boolean resume = options.getBoolean(Options.RESUME.key(), Options.RESUME.defaultValue())
|| options.getBoolean(HadoopVariantStorageEngine.HADOOP_LOAD_VARIANT_RESUME, false);
BatchFileOperation op = addBatchOperation(studyConfiguration, VariantTableDriver.JOB_OPERATION_NAME, pendingFiles, resume,
BatchFileOperation.Type.LOAD);
options.put(HADOOP_LOAD_VARIANT_STATUS, op.currentStatus());
options.put(AbstractVariantTableDriver.TIMESTAMP, op.getTimestamp());
}
}
/**
* Adds a new {@link BatchFileOperation} to the StudyConfiguration.
*
* Only one running operation is allowed at a time.
* If the last operation is READY, continue.
* If the last operation is in ERROR, continue only if it is the same operation over the same files.
* If the last operation is RUNNING or DONE, continue only if resume=true.
*
* If this is a new operation, the timestamp is incremented.
*
* @param studyConfiguration StudyConfiguration
* @param jobOperationName Job operation name used to create the jobName and as {@link BatchFileOperation#operationName}
* @param fileIds Files to be processed in this batch.
* @param resume Resume operation, assuming the previous operation failed.
* @param type Operation type as {@link BatchFileOperation#type}
* @return The current batchOperation
* @throws StorageEngineException if the operation can't be executed
*/
protected BatchFileOperation addBatchOperation(StudyConfiguration studyConfiguration, String jobOperationName, List<Integer> fileIds,
boolean resume, BatchFileOperation.Type type)
throws StorageEngineException {
List<BatchFileOperation> batches = studyConfiguration.getBatches();
BatchFileOperation batchFileOperation;
boolean newOperation = false;
if (!batches.isEmpty()) {
batchFileOperation = batches.get(batches.size() - 1);
BatchFileOperation.Status currentStatus = batchFileOperation.currentStatus();
switch (currentStatus) {
case READY:
batchFileOperation = new BatchFileOperation(jobOperationName, fileIds, batchFileOperation.getTimestamp() + 1, type);
newOperation = true;
break;
case DONE:
case RUNNING:
if (!resume) {
throw new StorageHadoopException("Unable to process a new batch. Ongoing batch operation: "
+ batchFileOperation);
}
// Intentional fall-through: when resuming, handle as the ERROR case.
case ERROR:
Collections.sort(fileIds);
Collections.sort(batchFileOperation.getFileIds());
if (batchFileOperation.getFileIds().equals(fileIds)) {
logger.info("Resuming Last batch loading due to error.");
} else {
throw new StorageHadoopException("Unable to resume last batch operation. "
+ "Must have the same files from the previous batch: " + batchFileOperation);
}
break;
default:
throw new IllegalArgumentException("Unknown Status " + currentStatus);
}
} else {
batchFileOperation = new BatchFileOperation(jobOperationName, fileIds, 1, type);
newOperation = true;
}
if (!Objects.equals(batchFileOperation.currentStatus(), BatchFileOperation.Status.DONE)) {
batchFileOperation.addStatus(Calendar.getInstance().getTime(), BatchFileOperation.Status.RUNNING);
}
if (newOperation) {
batches.add(batchFileOperation);
}
return batchFileOperation;
}
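/*
 * Illustrative summary of the transitions handled above (informal, derived from the code):
 *   READY          -> start a new operation with timestamp + 1
 *   RUNNING / DONE -> fail unless resume=true, in which case it is treated like ERROR
 *   ERROR          -> resume only if the file list matches the previous batch
 */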
/**
* Specifies whether this implementation needs the file to load to be available in HDFS.
*
* If true, the transformed file will be copied to HDFS during {@link #preLoad}.
*
* @return true if the transformed file must be copied to HDFS before loading
*/
protected abstract boolean needLoadFromHdfs();
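// Note on needLoadFromHdfs: implementations whose archive load runs as a MapReduce job
// are expected to return true, so that transformed files are staged in HDFS by preLoad.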
@Override
public URI load(URI input) throws IOException, StorageEngineException {
int studyId = getStudyId();
int fileId = options.getInt(Options.FILE_ID.key());
boolean loadArch = options.getBoolean(HADOOP_LOAD_ARCHIVE);
boolean loadVar = options.getBoolean(HADOOP_LOAD_VARIANT);
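// Propagate the archive chunk size and the study id into the Hadoop configuration
// used by the archive and merge steps.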
ArchiveHelper.setChunkSize(conf, conf.getInt(ArchiveDriver.CONFIG_ARCHIVE_CHUNK_SIZE, ArchiveDriver.DEFAULT_CHUNK_SIZE));
ArchiveHelper.setStudyId(conf, studyId);
if (loadArch) {
Set<Integer> loadedFiles = dbAdaptor.getVariantSourceDBAdaptor().getLoadedFiles(studyId);
if (!loadedFiles.contains(fileId)) {
loadArch(input);
} else {
logger.info("File {} already loaded in archive table. Skip this step!",
Paths.get(input.getPath()).getFileName().toString());
}
}
if (loadVar) {
List<Integer> pendingFiles = options.getAsIntegerList(HADOOP_LOAD_VARIANT_PENDING_FILES);
merge(studyId, pendingFiles);
}
return input; // TODO change return value?
}
protected abstract void loadArch(URI input) throws StorageEngineException;
public void merge(int studyId, List<Integer> pendingFiles) throws StorageEngineException {
// Check if status is "DONE"
if (BatchFileOperation.Status.DONE.equals(options.get(HADOOP_LOAD_VARIANT_STATUS, BatchFileOperation.Status.class))) {
// Merge operation status : DONE, not READY or RUNNING
// Don't need to merge again. Skip merge and run post-load/post-merge step
logger.info("Files {} already merged!", pendingFiles);
return;
}
String hadoopRoute = options.getString(HADOOP_BIN, "hadoop");
String jar = getJarWithDependencies();
options.put(HADOOP_LOAD_VARIANT_PENDING_FILES, pendingFiles);
Class<?> execClass = VariantTableDriver.class;
String args = VariantTableDriver.buildCommandLineArgs(variantsTableCredentials.toString(),
archiveTableCredentials.getTable(),
variantsTableCredentials.getTable(), studyId, pendingFiles, options);
String executable = hadoopRoute + " jar " + jar + ' ' + execClass.getName();
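// Resulting command shape (illustrative; the jar filename comes from configuration):
// hadoop jar <jar-with-dependencies> org.opencb.opencga.storage.hadoop.variant.index.VariantTableDriver <args>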
long startTime = System.currentTimeMillis();
Thread hook = newShutdownHook(VariantTableDriver.JOB_OPERATION_NAME, pendingFiles);
Runtime.getRuntime().addShutdownHook(hook);
try {
logger.info("------------------------------------------------------");
logger.info("Loading files {} into analysis table '{}'", pendingFiles, variantsTableCredentials.getTable());
logger.info(executable + " " + args);
logger.info("------------------------------------------------------");
int exitValue = mrExecutor.run(executable, args);
logger.info("------------------------------------------------------");
logger.info("Exit value: {}", exitValue);
logger.info("Total time: {}s", (System.currentTimeMillis() - startTime) / 1000.0);
if (exitValue != 0) {
throw new StorageEngineException("Error loading files " + pendingFiles + " into variant table \""
+ variantsTableCredentials.getTable() + "\"");
}
setStatus(BatchFileOperation.Status.DONE, VariantTableDriver.JOB_OPERATION_NAME, pendingFiles);
} catch (Exception e) {
setStatus(BatchFileOperation.Status.ERROR, VariantTableDriver.JOB_OPERATION_NAME, pendingFiles);
throw e;
} finally {
Runtime.getRuntime().removeShutdownHook(hook);
}
}
public String getJarWithDependencies() throws StorageEngineException {
return getJarWithDependencies(options);
}
public static String getJarWithDependencies(ObjectMap options) throws StorageEngineException {
String jar = options.getString(OPENCGA_STORAGE_HADOOP_JAR_WITH_DEPENDENCIES, null);
if (jar == null) {
throw new StorageEngineException("Missing option " + OPENCGA_STORAGE_HADOOP_JAR_WITH_DEPENDENCIES);
}
if (!Paths.get(jar).isAbsolute()) {
jar = System.getProperty("app.home", "") + "/" + jar;
}
return jar;
}
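// Illustrative configuration of getJarWithDependencies (the path below is hypothetical):
// options.put(OPENCGA_STORAGE_HADOOP_JAR_WITH_DEPENDENCIES,
//         "libs/opencga-storage-hadoop-core-jar-with-dependencies.jar");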
@Override
protected void checkLoadedVariants(URI input, int fileId, StudyConfiguration studyConfiguration) throws
StorageEngineException {
logger.warn("Skip check loaded variants");
}
@Override
public URI postLoad(URI input, URI output) throws StorageEngineException {
if (options.getBoolean(HADOOP_LOAD_VARIANT)) {
// Current StudyConfiguration may be outdated. Remove it.
options.remove(VariantStorageEngine.Options.STUDY_CONFIGURATION.key());
// HadoopCredentials dbCredentials = getDbCredentials();
// VariantHadoopDBAdaptor dbAdaptor = getDBAdaptor(dbCredentials);
options.put(VariantStorageEngine.Options.FILE_ID.key(), options.getAsIntegerList(HADOOP_LOAD_VARIANT_PENDING_FILES));
return super.postLoad(input, output);
} else {
logger.debug("Skip post load");
return input;
}
}
@Override
public void securePostLoad(List<Integer> fileIds, StudyConfiguration studyConfiguration) throws StorageEngineException {
super.securePostLoad(fileIds, studyConfiguration);
BatchFileOperation.Status status = secureSetStatus(studyConfiguration, BatchFileOperation.Status.READY,
VariantTableDriver.JOB_OPERATION_NAME, fileIds);
if (status != BatchFileOperation.Status.DONE) {
logger.warn("Unexpected status " + status);
}
}
}