/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.core.variant;
import com.google.common.collect.BiMap;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFHeaderVersion;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.opencb.biodata.formats.io.FileFormatException;
import org.opencb.biodata.formats.pedigree.io.PedigreePedReader;
import org.opencb.biodata.formats.pedigree.io.PedigreeReader;
import org.opencb.biodata.formats.variant.io.VariantReader;
import org.opencb.biodata.formats.variant.io.VariantWriter;
import org.opencb.biodata.formats.variant.vcf4.FullVcfCodec;
import org.opencb.biodata.formats.variant.vcf4.io.VariantVcfReader;
import org.opencb.biodata.models.variant.*;
import org.opencb.biodata.models.variant.avro.VariantAvro;
import org.opencb.biodata.tools.variant.stats.VariantGlobalStatsCalculator;
import org.opencb.biodata.tools.variant.tasks.VariantRunner;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.io.DataWriter;
import org.opencb.commons.run.ParallelTaskRunner;
import org.opencb.commons.run.Task;
import org.opencb.hpg.bigdata.core.io.avro.AvroFileWriter;
import org.opencb.opencga.core.common.ProgressLogger;
import org.opencb.opencga.storage.core.StoragePipeline;
import org.opencb.opencga.storage.core.config.StorageConfiguration;
import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
import org.opencb.opencga.storage.core.io.plain.StringDataReader;
import org.opencb.opencga.storage.core.io.plain.StringDataWriter;
import org.opencb.opencga.storage.core.metadata.BatchFileOperation;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.metadata.StudyConfigurationManager;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine.Options;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor;
import org.opencb.opencga.storage.core.variant.io.VariantReaderUtils;
import org.opencb.opencga.storage.core.variant.io.json.VariantJsonWriter;
import org.opencb.opencga.storage.core.variant.transform.MalformedVariantHandler;
import org.opencb.opencga.storage.core.variant.transform.VariantAvroTransformTask;
import org.opencb.opencga.storage.core.variant.transform.VariantJsonTransformTask;
import org.opencb.opencga.storage.core.variant.transform.VariantTransformTask;
import org.slf4j.Logger;
import java.io.*;
import java.net.URI;
import java.nio.ByteBuffer;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ExecutionException;
import java.util.function.BiConsumer;
import java.util.function.Supplier;
import java.util.zip.GZIPInputStream;
/**
* Created on 30/03/16.
*
* @author Jacobo Coll <jacobo167@gmail.com>
*/
public abstract class VariantStoragePipeline implements StoragePipeline {
private static final String HTSJDK_PARSER = "htsjdk";
protected final StorageConfiguration configuration;
protected final String storageEngineId;
protected final ObjectMap options;
protected final VariantDBAdaptor dbAdaptor;
protected final VariantReaderUtils variantReaderUtils;
protected final Logger logger;
protected final ObjectMap transformStats = new ObjectMap();
public VariantStoragePipeline(StorageConfiguration configuration, String storageEngineId, Logger logger, VariantDBAdaptor dbAdaptor,
VariantReaderUtils variantReaderUtils) {
this(configuration, storageEngineId, logger, dbAdaptor, variantReaderUtils,
new ObjectMap(configuration.getStorageEngine(storageEngineId).getVariant().getOptions()));
}
/**
* @param configuration Storage Configuration
* @param storageEngineId StorageEngineID
* @param logger Logger
* @param dbAdaptor VariantDBAdaptor. Can be null if the load step is skipped
* @param variantReaderUtils VariantReaderUtils
* @param options Unique copy of the options to be used. This object can not be shared.
*/
public VariantStoragePipeline(StorageConfiguration configuration, String storageEngineId, Logger logger, VariantDBAdaptor dbAdaptor,
VariantReaderUtils variantReaderUtils, ObjectMap options) {
this.configuration = configuration;
this.storageEngineId = storageEngineId;
this.logger = logger;
this.dbAdaptor = dbAdaptor;
this.variantReaderUtils = variantReaderUtils;
this.options = options;
if (dbAdaptor == null) {
options.put(Options.ISOLATE_FILE_FROM_STUDY_CONFIGURATION.key(), true);
}
}
@Override
public URI extract(URI input, URI ouput) {
return input;
}
@Override
public ObjectMap getTransformStats() {
return transformStats;
}
@Override
public URI preTransform(URI input) throws StorageEngineException, IOException, FileFormatException {
String fileName = VariantReaderUtils.getFileName(input);
int fileId = options.getInt(Options.FILE_ID.key(), Options.FILE_ID.defaultValue());
int studyId = options.getInt(Options.STUDY_ID.key(), Options.STUDY_ID.defaultValue());
boolean isolate = options.getBoolean(Options.ISOLATE_FILE_FROM_STUDY_CONFIGURATION.key(),
Options.ISOLATE_FILE_FROM_STUDY_CONFIGURATION.defaultValue());
StudyConfiguration studyConfiguration;
if (studyId < 0 && fileId < 0 || isolate) {
logger.debug("Isolated study configuration");
studyConfiguration = new StudyConfiguration(Options.STUDY_ID.defaultValue(), "unknown", Options.FILE_ID.defaultValue(),
fileName);
studyConfiguration.setAggregation(options.get(Options.AGGREGATED_TYPE.key(), VariantSource.Aggregation.class));
options.put(Options.ISOLATE_FILE_FROM_STUDY_CONFIGURATION.key(), true);
} else {
studyConfiguration = dbAdaptor.getStudyConfigurationManager().lockAndUpdate(studyId, existingStudyConfiguration -> {
if (existingStudyConfiguration == null) {
logger.info("Creating a new StudyConfiguration");
checkStudyId(studyId);
existingStudyConfiguration = new StudyConfiguration(studyId, options.getString(Options.STUDY_NAME.key()));
existingStudyConfiguration.setAggregation(options.get(Options.AGGREGATED_TYPE.key(),
VariantSource.Aggregation.class, Options.AGGREGATED_TYPE.defaultValue()));
}
if (existingStudyConfiguration.getAggregation() == null) {
existingStudyConfiguration.setAggregation(options.get(Options.AGGREGATED_TYPE.key(),
VariantSource.Aggregation.class, Options.AGGREGATED_TYPE.defaultValue()));
}
options.put(Options.FILE_ID.key(), checkNewFile(existingStudyConfiguration, fileId, fileName));
return existingStudyConfiguration;
});
}
options.put(Options.STUDY_CONFIGURATION.key(), studyConfiguration);
return input;
}
protected VariantSource buildVariantSource(Path input) throws StorageEngineException {
StudyConfiguration studyConfiguration = getStudyConfiguration();
Integer fileId;
if (options.getBoolean(Options.ISOLATE_FILE_FROM_STUDY_CONFIGURATION.key(), Options.ISOLATE_FILE_FROM_STUDY_CONFIGURATION
.defaultValue())) {
fileId = Options.FILE_ID.defaultValue();
} else {
fileId = options.getInt(Options.FILE_ID.key());
}
VariantSource.Aggregation aggregation = options.get(Options.AGGREGATED_TYPE.key(), VariantSource.Aggregation.class, Options
.AGGREGATED_TYPE.defaultValue());
String fileName = input.getFileName().toString();
VariantStudy.StudyType type = options.get(Options.STUDY_TYPE.key(), VariantStudy.StudyType.class,
Options.STUDY_TYPE.defaultValue());
return new VariantSource(
fileName,
fileId.toString(),
Integer.toString(studyConfiguration.getStudyId()),
studyConfiguration.getStudyName(), type, aggregation);
}
public static Pair<VCFHeader, VCFHeaderVersion> readHtsHeader(Path input) throws StorageEngineException {
try (InputStream fileInputStream = input.toString().endsWith("gz")
? new GZIPInputStream(new FileInputStream(input.toFile()))
: new FileInputStream(input.toFile())) {
FullVcfCodec codec = new FullVcfCodec();
LineIterator lineIterator = codec.makeSourceFromStream(fileInputStream);
VCFHeader header = (VCFHeader) codec.readActualHeader(lineIterator);
VCFHeaderVersion headerVersion = codec.getVCFHeaderVersion();
return new ImmutablePair<>(header, headerVersion);
} catch (IOException e) {
throw new StorageEngineException("Unable to read VCFHeader", e);
}
}
/**
* Transform raw variant files into biodata model.
*
* @param inputUri Input file. Accepted formats: *.vcf, *.vcf.gz
* @param pedigreeUri Pedigree input file. Accepted formats: *.ped
* @param outputUri The destination folder
* @throws StorageEngineException If any IO problem
*/
@Override
public URI transform(URI inputUri, URI pedigreeUri, URI outputUri) throws StorageEngineException {
// input: VcfReader
// output: JsonWriter
Path input = Paths.get(inputUri.getPath());
Path pedigree = pedigreeUri == null ? null : Paths.get(pedigreeUri.getPath());
Path output = Paths.get(outputUri.getPath());
// boolean includeSamples = options.getBoolean(Options.INCLUDE_GENOTYPES.key(), false);
boolean includeStats = options.getBoolean(Options.INCLUDE_STATS.key(), false);
// boolean includeSrc = options.getBoolean(Options.INCLUDE_SRC.key(), Options.INCLUDE_SRC.defaultValue());
boolean includeSrc = false;
boolean failOnError = options.getBoolean(Options.TRANSFORM_FAIL_ON_MALFORMED_VARIANT.key(),
Options.TRANSFORM_FAIL_ON_MALFORMED_VARIANT.defaultValue());
String format = options.getString(Options.TRANSFORM_FORMAT.key(), Options.TRANSFORM_FORMAT.defaultValue());
String parser = options.getString("transform.parser", HTSJDK_PARSER);
VariantSource source = buildVariantSource(input);
String fileName = source.getFileName();
boolean generateReferenceBlocks = options.getBoolean(Options.GVCF.key(), false);
int batchSize = options.getInt(Options.TRANSFORM_BATCH_SIZE.key(), Options.TRANSFORM_BATCH_SIZE.defaultValue());
String compression = options.getString(Options.COMPRESS_METHOD.key(), Options.COMPRESS_METHOD.defaultValue());
String extension = "";
int numTasks = options.getInt(Options.TRANSFORM_THREADS.key(), Options.TRANSFORM_THREADS.defaultValue());
int capacity = options.getInt("blockingQueueCapacity", numTasks * 2);
if ("gzip".equalsIgnoreCase(compression) || "gz".equalsIgnoreCase(compression)) {
extension = ".gz";
} else if ("snappy".equalsIgnoreCase(compression) || "snz".equalsIgnoreCase(compression)) {
extension = ".snappy";
} else if (!compression.isEmpty()) {
throw new IllegalArgumentException("Unknown compression method " + compression);
}
Path outputMalformedVariants = output.resolve(fileName + "." + VariantReaderUtils.MALFORMED_FILE + ".txt");
Path outputVariantsFile = output.resolve(fileName + "." + VariantReaderUtils.VARIANTS_FILE + "." + format + extension);
Path outputMetaFile = VariantReaderUtils.getMetaFromTransformedFile(outputVariantsFile);
// Close at the end!
final MalformedVariantHandler malformedHandler;
try {
malformedHandler = new MalformedVariantHandler(outputMalformedVariants);
} catch (IOException e) {
throw new StorageEngineException(e.getMessage(), e);
}
ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder()
.setNumTasks(numTasks)
.setBatchSize(batchSize)
.setCapacity(capacity)
.setSorted(true)
.build();
logger.info("Transforming variants using {} into {} ...", parser, format);
long start, end;
if (numTasks == 1 && "json".equals(format)) { //Run transformation with a SingleThread runner. The legacy way
if (!".gz".equals(extension)) { //FIXME: Add compatibility with snappy compression
logger.warn("Force using gzip compression");
extension = ".gz";
outputVariantsFile = output.resolve(fileName + ".variants.json" + extension);
}
//Ped Reader
PedigreeReader pedReader = null;
if (pedigree != null && pedigree.toFile().exists()) { //FIXME Add "endsWith(".ped") ??
pedReader = new PedigreePedReader(pedigree.toString());
}
//Reader
VariantReader reader = new VariantVcfReader(source, input.toAbsolutePath().toString());
//Writers
VariantJsonWriter jsonWriter = new VariantJsonWriter(source, output);
jsonWriter.includeStats(includeStats);
List<VariantWriter> writers = Collections.<VariantWriter>singletonList(jsonWriter);
//Runner
VariantRunner vr = new VariantRunner(source, reader, pedReader, writers,
Collections.<Task<Variant>>singletonList(new VariantGlobalStatsCalculator(source)), batchSize);
logger.info("Single thread transform...");
start = System.currentTimeMillis();
try {
vr.run();
} catch (IOException e) {
throw new StorageEngineException("Fail runner execution", e);
}
end = System.currentTimeMillis();
} else if ("avro".equals(format)) {
//Read VariantSource
source = VariantReaderUtils.readVariantSource(input, source);
//Reader
StringDataReader dataReader = new StringDataReader(input);
long fileSize = 0;
try {
fileSize = dataReader.getFileSize();
} catch (IOException e) {
throw new StorageEngineException("Error reading file " + input, e);
}
ProgressLogger progressLogger = new ProgressLogger("Transforming file:", fileSize, 200);
dataReader.setReadBytesListener((totalRead, delta) -> progressLogger.increment(delta, "Bytes"));
//Writer
DataWriter<ByteBuffer> dataWriter;
try {
dataWriter = new AvroFileWriter<>(VariantAvro.getClassSchema(), compression, new FileOutputStream(outputVariantsFile
.toFile()));
} catch (FileNotFoundException e) {
throw new StorageEngineException("Fail init writer", e);
}
Supplier<VariantTransformTask<ByteBuffer>> taskSupplier;
if (parser.equalsIgnoreCase(HTSJDK_PARSER)) {
logger.info("Using HTSJDK to read variants.");
FullVcfCodec codec = new FullVcfCodec();
final VariantSource finalSource = source;
Pair<VCFHeader, VCFHeaderVersion> header = readHtsHeader(input);
VariantGlobalStatsCalculator statsCalculator = new VariantGlobalStatsCalculator(source);
taskSupplier = () -> new VariantAvroTransformTask(header.getKey(), header.getValue(), finalSource, outputMetaFile,
statsCalculator, includeSrc, generateReferenceBlocks)
.setFailOnError(failOnError).addMalformedErrorHandler(malformedHandler);
} else {
// TODO Create a utility to determine which extensions are variants files
final VariantVcfFactory factory = createVariantVcfFactory(source, fileName);
logger.info("Using Biodata to read variants.");
final VariantSource finalSource = source;
VariantGlobalStatsCalculator statsCalculator = new VariantGlobalStatsCalculator(source);
taskSupplier = () -> new VariantAvroTransformTask(factory, finalSource, outputMetaFile, statsCalculator, includeSrc)
.setFailOnError(failOnError).addMalformedErrorHandler(malformedHandler);
}
logger.info("Generating output file {}", outputVariantsFile);
ParallelTaskRunner<String, ByteBuffer> ptr;
try {
ptr = new ParallelTaskRunner<>(
dataReader,
taskSupplier,
dataWriter,
config
);
} catch (Exception e) {
throw new StorageEngineException("Error while creating ParallelTaskRunner", e);
}
logger.info("Multi thread transform... [1 reading, {} transforming, 1 writing]", numTasks);
start = System.currentTimeMillis();
try {
ptr.run();
} catch (ExecutionException e) {
throw new StorageEngineException("Error while executing TransformVariants in ParallelTaskRunner", e);
}
end = System.currentTimeMillis();
} else if ("json".equals(format)) {
//Read VariantSource
source = VariantReaderUtils.readVariantSource(input, source);
//Reader
StringDataReader dataReader = new StringDataReader(input);
long fileSize = 0;
try {
fileSize = dataReader.getFileSize();
} catch (IOException e) {
throw new StorageEngineException("Error reading file " + input, e);
}
ProgressLogger progressLogger = new ProgressLogger("Transforming file:", fileSize, 200);
dataReader.setReadBytesListener((totalRead, delta) -> progressLogger.increment(delta, "Bytes"));
//Writers
StringDataWriter dataWriter = new StringDataWriter(outputVariantsFile, true);
final VariantSource finalSource = source;
ParallelTaskRunner<String, String> ptr;
Supplier<VariantTransformTask<String>> taskSupplier;
if (parser.equalsIgnoreCase(HTSJDK_PARSER)) {
logger.info("Using HTSJDK to read variants.");
Pair<VCFHeader, VCFHeaderVersion> header = readHtsHeader(input);
VariantGlobalStatsCalculator statsCalculator = new VariantGlobalStatsCalculator(finalSource);
taskSupplier = () -> new VariantJsonTransformTask(header.getKey(), header.getValue(), finalSource,
outputMetaFile, statsCalculator, includeSrc, generateReferenceBlocks)
.setFailOnError(failOnError).addMalformedErrorHandler(malformedHandler);
} else {
// TODO Create a utility to determine which extensions are variants files
final VariantVcfFactory factory = createVariantVcfFactory(source, fileName);
logger.info("Using Biodata to read variants.");
VariantGlobalStatsCalculator statsCalculator = new VariantGlobalStatsCalculator(source);
taskSupplier = () -> new VariantJsonTransformTask(factory, finalSource, outputMetaFile, statsCalculator, includeSrc)
.setFailOnError(failOnError).addMalformedErrorHandler(malformedHandler);
}
logger.info("Generating output file {}", outputVariantsFile);
try {
ptr = new ParallelTaskRunner<>(
dataReader,
taskSupplier,
dataWriter,
config
);
} catch (Exception e) {
throw new StorageEngineException("Error while creating ParallelTaskRunner", e);
}
logger.info("Multi thread transform... [1 reading, {} transforming, 1 writing]", numTasks);
start = System.currentTimeMillis();
try {
ptr.run();
} catch (ExecutionException e) {
throw new StorageEngineException("Error while executing TransformVariants in ParallelTaskRunner", e);
}
end = System.currentTimeMillis();
} else if ("proto".equals(format)) {
//Read VariantSource
source = VariantReaderUtils.readVariantSource(input, source);
Pair<Long, Long> times = processProto(input, fileName, output, source, outputVariantsFile, outputMetaFile,
includeSrc, parser, generateReferenceBlocks, batchSize, extension, compression, malformedHandler, failOnError);
start = times.getKey();
end = times.getValue();
} else {
throw new IllegalArgumentException("Unknown format " + format);
}
logger.info("end - start = " + (end - start) / 1000.0 + "s");
logger.info("Variants transformed!");
// Close the malformed variant handler
malformedHandler.close();
if (malformedHandler.getMalformedLines() > 0) {
getTransformStats().put("malformed lines", malformedHandler.getMalformedLines());
}
return outputUri.resolve(outputVariantsFile.getFileName().toString());
}
protected VariantVcfFactory createVariantVcfFactory(VariantSource source, String fileName) throws StorageEngineException {
VariantVcfFactory factory;
if (fileName.endsWith(".vcf") || fileName.endsWith(".vcf.gz") || fileName.endsWith(".vcf.snappy")) {
if (VariantSource.Aggregation.NONE.equals(source.getAggregation())) {
factory = new VariantVcfFactory();
} else {
factory = new VariantAggregatedVcfFactory();
}
} else {
throw new StorageEngineException("Variants input file format not supported");
}
return factory;
}
protected Pair<Long, Long> processProto(
Path input, String fileName, Path output, VariantSource source, Path outputVariantsFile,
Path outputMetaFile, boolean includeSrc, String parser, boolean generateReferenceBlocks,
int batchSize, String extension, String compression, BiConsumer<String, RuntimeException> malformatedHandler,
boolean failOnError)
throws StorageEngineException {
throw new NotImplementedException("Please request feature");
}
@Override
public URI postTransform(URI input) throws IOException, FileFormatException {
// Delete isolated storage configuration
if (options.getBoolean(Options.ISOLATE_FILE_FROM_STUDY_CONFIGURATION.key())) {
options.remove(Options.STUDY_CONFIGURATION.key());
}
return input;
}
@Override
public URI preLoad(URI input, URI output) throws StorageEngineException {
int studyId = options.getInt(Options.STUDY_ID.key(), -1);
options.remove(Options.STUDY_CONFIGURATION.key());
//Get the studyConfiguration. If there is no StudyConfiguration, create a empty one.
dbAdaptor.getStudyConfigurationManager().lockAndUpdate(studyId, studyConfiguration -> {
studyConfiguration = checkOrCreateStudyConfiguration(studyConfiguration);
VariantSource source = readVariantSource(input, options);
securePreLoad(studyConfiguration, source);
options.put(Options.STUDY_CONFIGURATION.key(), studyConfiguration);
return studyConfiguration;
});
return input;
}
/**
* PreLoad step for modify the StudyConfiguration.
* This step is executed inside a study lock.
*
* @see StudyConfigurationManager#lockStudy(int)
* @param studyConfiguration StudyConfiguration
* @param source VariantSource
* @throws StorageEngineException If any condition is wrong
*/
protected void securePreLoad(StudyConfiguration studyConfiguration, VariantSource source) throws StorageEngineException {
/*
* Before load file, check and add fileName to the StudyConfiguration.
* FileID and FileName is read from the VariantSource
* If fileId is -1, read fileId from Options
* Will fail if:
* fileId is not an integer
* fileId was already in the studyConfiguration.indexedFiles
* fileId was already in the studyConfiguration.fileIds with a different fileName
* fileName was already in the studyConfiguration.fileIds with a different fileId
*/
int fileId;
String fileName = source.getFileName();
try {
fileId = Integer.parseInt(source.getFileId());
} catch (NumberFormatException e) {
throw new StorageEngineException("FileId '" + source.getFileId() + "' is not an integer", e);
}
if (fileId < 0) {
fileId = options.getInt(Options.FILE_ID.key(), Options.FILE_ID.defaultValue());
} else {
int fileIdFromParams = options.getInt(Options.FILE_ID.key(), Options.FILE_ID.defaultValue());
if (fileIdFromParams >= 0 && fileIdFromParams != fileId) {
if (!options.getBoolean(Options.OVERRIDE_FILE_ID.key(), Options.OVERRIDE_FILE_ID.defaultValue())) {
throw new StorageEngineException("Wrong fileId! Unable to load using fileId: "
+ fileIdFromParams + ". "
+ "The input file has fileId: " + fileId
+ ". Use " + Options.OVERRIDE_FILE_ID.key() + " to ignore original fileId.");
} else {
//Override the fileId
fileId = fileIdFromParams;
}
}
}
if (studyConfiguration.getIndexedFiles().isEmpty()) {
// First indexed file
// Use the EXCLUDE_GENOTYPES value from CLI. Write in StudyConfiguration.attributes
boolean excludeGenotypes = options.getBoolean(Options.EXCLUDE_GENOTYPES.key(), Options.EXCLUDE_GENOTYPES.defaultValue());
studyConfiguration.setAggregation(options.get(Options.AGGREGATED_TYPE.key(), VariantSource.Aggregation.class,
Options.AGGREGATED_TYPE.defaultValue()));
studyConfiguration.getAttributes().put(Options.EXCLUDE_GENOTYPES.key(), excludeGenotypes);
} else {
// Not first indexed file
// Use the EXCLUDE_GENOTYPES value from StudyConfiguration. Ignore CLI value
boolean excludeGenotypes = studyConfiguration.getAttributes()
.getBoolean(Options.EXCLUDE_GENOTYPES.key(), Options.EXCLUDE_GENOTYPES.defaultValue());
options.put(Options.EXCLUDE_GENOTYPES.key(), excludeGenotypes);
}
fileId = checkNewFile(studyConfiguration, fileId, fileName);
options.put(Options.FILE_ID.key(), fileId);
studyConfiguration.getFileIds().put(source.getFileName(), fileId);
// studyConfiguration.getHeaders().put(fileId, source.getMetadata().get(VariantFileUtils.VARIANT_FILE_HEADER).toString());
checkAndUpdateStudyConfiguration(studyConfiguration, fileId, source, options);
// Check Extra genotype fields
if (options.containsKey(Options.EXTRA_GENOTYPE_FIELDS.key())
&& StringUtils.isNotEmpty(options.getString(Options.EXTRA_GENOTYPE_FIELDS.key()))) {
List<String> extraFields = options.getAsStringList(Options.EXTRA_GENOTYPE_FIELDS.key());
if (studyConfiguration.getIndexedFiles().isEmpty()) {
studyConfiguration.getAttributes().put(Options.EXTRA_GENOTYPE_FIELDS.key(), extraFields);
} else {
if (!extraFields.equals(studyConfiguration.getAttributes().getAsStringList(Options.EXTRA_GENOTYPE_FIELDS.key()))) {
throw new StorageEngineException("Unable to change Stored Extra Fields if there are already indexed files.");
}
}
if (!studyConfiguration.getAttributes().containsKey(Options.EXTRA_GENOTYPE_FIELDS_TYPE.key())) {
List<String> extraFieldsType = new ArrayList<>(extraFields.size());
for (String extraField : extraFields) {
List<Map<String, Object>> formats = (List) source.getHeader().getMeta().get("FORMAT");
VCFHeaderLineType type = VCFHeaderLineType.String;
for (Map<String, Object> format : formats) {
if (format.get("ID").toString().equals(extraField)) {
if ("1".equals(format.get("Number"))) {
try {
type = VCFHeaderLineType.valueOf(Objects.toString(format.get("Type")));
} catch (IllegalArgumentException ignore) {
type = VCFHeaderLineType.String;
}
} else {
//Fields with arity != 1 are loaded as String
type = VCFHeaderLineType.String;
}
break;
}
}
switch (type) {
case String:
case Float:
case Integer:
break;
case Character:
default:
type = VCFHeaderLineType.String;
break;
}
extraFieldsType.add(type.toString());
logger.debug(extraField + " : " + type);
}
studyConfiguration.getAttributes().put(Options.EXTRA_GENOTYPE_FIELDS_TYPE.key(), extraFieldsType);
}
}
}
protected StudyConfiguration checkOrCreateStudyConfiguration(boolean forceFetch) throws StorageEngineException {
return checkOrCreateStudyConfiguration(getStudyConfiguration(forceFetch));
}
protected StudyConfiguration checkOrCreateStudyConfiguration(StudyConfiguration studyConfiguration) throws StorageEngineException {
if (studyConfiguration == null) {
logger.info("Creating a new StudyConfiguration");
int studyId = options.getInt(Options.STUDY_ID.key(), Options.STUDY_ID.defaultValue());
String studyName = options.getString(Options.STUDY_NAME.key(), Options.STUDY_NAME.defaultValue());
checkStudyId(studyId);
studyConfiguration = new StudyConfiguration(studyId, studyName);
options.put(Options.STUDY_CONFIGURATION.key(), studyConfiguration);
}
return studyConfiguration;
}
/*
* Before load file, the StudyConfiguration has to be updated with the new sample names.
* Will read param SAMPLE_IDS like [<sampleName>:<sampleId>,]*
* If SAMPLE_IDS is missing, will auto-generate sampleIds
* Will fail if:
* param SAMPLE_IDS is malformed
* any given sampleId is not an integer
* any given sampleName is not in the input file
* any given sampleName was already in the StudyConfiguration (so, was already loaded)
* some sample was missing in the given SAMPLE_IDS param
*
*/
public static void checkAndUpdateStudyConfiguration(StudyConfiguration studyConfiguration, int fileId, VariantSource source,
ObjectMap options)
throws StorageEngineException {
if (options.containsKey(Options.SAMPLE_IDS.key()) && !options.getAsStringList(Options.SAMPLE_IDS.key()).isEmpty()) {
for (String sampleEntry : options.getAsStringList(Options.SAMPLE_IDS.key())) {
String[] split = sampleEntry.split(":");
if (split.length != 2) {
throw new StorageEngineException("Param " + sampleEntry + " is malformed");
}
String sampleName = split[0];
int sampleId;
try {
sampleId = Integer.parseInt(split[1]);
} catch (NumberFormatException e) {
throw new StorageEngineException("SampleId " + split[1] + " is not an integer", e);
}
if (!source.getSamplesPosition().containsKey(sampleName)) {
//ERROR
throw new StorageEngineException("Given sampleName '" + sampleName + "' is not in the input file");
} else {
if (!studyConfiguration.getSampleIds().containsKey(sampleName)) {
//Add sample to StudyConfiguration
studyConfiguration.getSampleIds().put(sampleName, sampleId);
} else {
if (studyConfiguration.getSampleIds().get(sampleName) != sampleId) {
throw new StorageEngineException("Sample " + sampleName + ":" + sampleId
+ " was already present. It was in the StudyConfiguration with a different sampleId: "
+ studyConfiguration.getSampleIds().get(sampleName));
}
}
}
}
//Check that all samples has a sampleId
List<String> missingSamples = new LinkedList<>();
for (String sampleName : source.getSamples()) {
if (!studyConfiguration.getSampleIds().containsKey(sampleName)) {
missingSamples.add(sampleName);
} /*else {
Integer sampleId = studyConfiguration.getSampleIds().get(sampleName);
if (studyConfiguration.getIndexedSamples().contains(sampleId)) {
logger.warn("Sample " + sampleName + ":" + sampleId + " was already loaded.
It was in the StudyConfiguration.indexedSamples");
}
}*/
}
if (!missingSamples.isEmpty()) {
throw new StorageEngineException("Samples " + missingSamples.toString() + " has not assigned sampleId");
}
} else {
//Find the grader sample Id in the studyConfiguration, in order to add more sampleIds if necessary.
int maxId = 0;
for (Integer i : studyConfiguration.getSampleIds().values()) {
if (i > maxId) {
maxId = i;
}
}
//Assign new sampleIds
for (String sample : source.getSamples()) {
if (!studyConfiguration.getSampleIds().containsKey(sample)) {
//If the sample was not in the original studyId, a new SampleId is assigned.
int sampleId;
int samplesSize = studyConfiguration.getSampleIds().size();
Integer samplePosition = source.getSamplesPosition().get(sample);
if (!studyConfiguration.getSampleIds().containsValue(samplePosition)) {
//1- Use with the SamplePosition
sampleId = samplePosition;
} else if (!studyConfiguration.getSampleIds().containsValue(samplesSize)) {
//2- Use the number of samples in the StudyConfiguration.
sampleId = samplesSize;
} else {
//3- Use the maxId
sampleId = maxId + 1;
}
studyConfiguration.getSampleIds().put(sample, sampleId);
if (sampleId > maxId) {
maxId = sampleId;
}
}
}
}
if (studyConfiguration.getSamplesInFiles().containsKey(fileId)) {
LinkedHashSet<Integer> sampleIds = studyConfiguration.getSamplesInFiles().get(fileId);
List<String> missingSamples = new LinkedList<>();
for (String sampleName : source.getSamples()) {
if (!sampleIds.contains(studyConfiguration.getSampleIds().get(sampleName))) {
missingSamples.add(sampleName);
}
}
if (!missingSamples.isEmpty()) {
throw new StorageEngineException("Samples " + missingSamples.toString() + " were not in file " + fileId);
}
if (sampleIds.size() != source.getSamples().size()) {
throw new StorageEngineException("Incorrect number of samples in file " + fileId);
}
} else {
LinkedHashSet<Integer> sampleIdsInFile = new LinkedHashSet<>(source.getSamples().size());
for (String sample : source.getSamples()) {
sampleIdsInFile.add(studyConfiguration.getSampleIds().get(sample));
}
studyConfiguration.getSamplesInFiles().put(fileId, sampleIdsInFile);
}
}
@Override
public URI postLoad(URI input, URI output) throws StorageEngineException {
// ObjectMap options = configuration.getStorageEngine(storageEngineId).getVariant().getOptions();
List<Integer> fileIds = options.getAsIntegerList(Options.FILE_ID.key());
int studyId = options.getInt(Options.STUDY_ID.key(), -1);
long lock = dbAdaptor.getStudyConfigurationManager().lockStudy(studyId);
// Check loaded variants BEFORE updating the StudyConfiguration
checkLoadedVariants(input, fileIds, getStudyConfiguration());
StudyConfiguration studyConfiguration;
try {
//Update StudyConfiguration
studyConfiguration = getStudyConfiguration(true);
securePostLoad(fileIds, studyConfiguration);
dbAdaptor.getStudyConfigurationManager().updateStudyConfiguration(studyConfiguration, new QueryOptions());
} finally {
dbAdaptor.getStudyConfigurationManager().unLockStudy(studyId, lock);
}
return input;
}
public void securePostLoad(List<Integer> fileIds, StudyConfiguration studyConfiguration) throws StorageEngineException {
// Update indexed files
studyConfiguration.getIndexedFiles().addAll(fileIds);
// Update the cohort ALL. Invalidate if needed
String defaultCohortName = StudyEntry.DEFAULT_COHORT;
BiMap<String, Integer> indexedSamples = StudyConfiguration.getIndexedSamples(studyConfiguration);
final Integer defaultCohortId;
if (studyConfiguration.getCohortIds().containsKey(defaultCohortName)) { //Check if "defaultCohort" exists
defaultCohortId = studyConfiguration.getCohortIds().get(defaultCohortName);
if (studyConfiguration.getCalculatedStats().contains(defaultCohortId)) { //Check if "defaultCohort" is calculated
//Check if the samples number are different
if (!indexedSamples.values().equals(studyConfiguration.getCohorts().get(defaultCohortId))) {
logger.debug("Cohort \"{}\":{} was already calculated. Invalidating stats.",
defaultCohortName, defaultCohortId);
studyConfiguration.getCalculatedStats().remove(defaultCohortId);
studyConfiguration.getInvalidStats().add(defaultCohortId);
}
}
} else {
// Default cohort does not exist. Create cohort.
defaultCohortId = studyConfiguration.getCohortIds().values().stream().max(Integer::compareTo).orElse(1);
studyConfiguration.getCohortIds().put(StudyEntry.DEFAULT_COHORT, defaultCohortId);
}
logger.info("Add loaded samples to Default Cohort \"" + defaultCohortName + '"');
studyConfiguration.getCohorts().put(defaultCohortId, indexedSamples.values());
}
@Override
public void close() throws StorageEngineException {
if (dbAdaptor != null) {
try {
dbAdaptor.close();
} catch (IOException e) {
throw new StorageEngineException("Error closing DBAdaptor", e);
}
}
}
protected abstract void checkLoadedVariants(URI input, int fileId, StudyConfiguration studyConfiguration)
throws StorageEngineException;
protected void checkLoadedVariants(URI input, List<Integer> fileIds, StudyConfiguration studyConfiguration)
throws StorageEngineException {
for (Integer fileId : fileIds) {
checkLoadedVariants(input, fileId, studyConfiguration);
}
}
public static String buildFilename(String studyName, int fileId) {
int index = studyName.indexOf(":");
if (index >= 0) {
studyName = studyName.substring(index + 1);
}
return studyName + "_" + fileId;
}
public VariantSource readVariantSource(URI input, ObjectMap options) throws StorageEngineException {
return variantReaderUtils.readVariantSource(input);
}
/* --------------------------------------- */
/* StudyConfiguration utils methods */
/* --------------------------------------- */
public final StudyConfiguration getStudyConfiguration() throws StorageEngineException {
return getStudyConfiguration(false);
}
/**
* Reads the study configuration.
*
* @param forceFetch If true, forces to get the StudyConfiguration from the database. Ignores current one.
* @return The study configuration.
* @throws StorageEngineException If the study configuration is not found
*/
public final StudyConfiguration getStudyConfiguration(boolean forceFetch) throws StorageEngineException {
// TODO: should StudyConfiguration be a class field?
if (!forceFetch && options.containsKey(Options.STUDY_CONFIGURATION.key())) {
return options.get(Options.STUDY_CONFIGURATION.key(), StudyConfiguration.class);
} else {
StudyConfigurationManager studyConfigurationManager = dbAdaptor.getStudyConfigurationManager();
StudyConfiguration studyConfiguration;
if (!StringUtils.isEmpty(options.getString(Options.STUDY_NAME.key()))
&& !options.getString(Options.STUDY_NAME.key()).equals(Options.STUDY_NAME.defaultValue())) {
studyConfiguration = studyConfigurationManager.getStudyConfiguration(options.getString(Options.STUDY_NAME.key()),
new QueryOptions(options)).first();
if (studyConfiguration != null && options.containsKey(Options.STUDY_ID.key())) {
//Check if StudyId matches
if (studyConfiguration.getStudyId() != options.getInt(Options.STUDY_ID.key())) {
throw new StorageEngineException("Invalid StudyConfiguration. StudyId mismatches");
}
}
} else if (options.containsKey(Options.STUDY_ID.key())) {
studyConfiguration = studyConfigurationManager.getStudyConfiguration(options.getInt(Options.STUDY_ID.key()),
new QueryOptions(options)).first();
} else {
throw new StorageEngineException("Unable to get StudyConfiguration. Missing studyId or studyName");
}
options.put(Options.STUDY_CONFIGURATION.key(), studyConfiguration);
return studyConfiguration;
}
}
/**
* Check if the file(name,id) can be added to the StudyConfiguration.
*
* Will fail if:
* fileName was already in the studyConfiguration.fileIds with a different fileId
* fileId was already in the studyConfiguration.fileIds with a different fileName
* fileId was already in the studyConfiguration.indexedFiles
*
* @param studyConfiguration Study Configuration
* @param fileId FileId to add. If negative, will generate a new one
* @param fileName File name
* @return fileId related to that file.
* @throws StorageEngineException if the file is not valid for being loaded
*/
protected int checkNewFile(StudyConfiguration studyConfiguration, int fileId, String fileName) throws StorageEngineException {
Map<Integer, String> idFiles = StudyConfiguration.inverseMap(studyConfiguration.getFileIds());
if (fileId < 0) {
if (studyConfiguration.getFileIds().containsKey(fileName)) {
fileId = studyConfiguration.getFileIds().get(fileName);
} else {
fileId = studyConfiguration.getFileIds().values().stream().max(Integer::compareTo).orElse(0) + 1;
studyConfiguration.getFileIds().put(fileName, fileId);
}
//throw new StorageEngineException("Invalid fileId " + fileId + " for file " + fileName + ". FileId must be positive.");
}
if (studyConfiguration.getFileIds().containsKey(fileName)) {
if (studyConfiguration.getFileIds().get(fileName) != fileId) {
throw new StorageEngineException("File " + fileName + " (" + fileId + ") "
+ "has a different fileId in the study "
+ studyConfiguration.getStudyName() + " (" + studyConfiguration.getStudyId() + ") : "
+ fileName + " (" + studyConfiguration.getFileIds().get(fileName) + ")");
}
}
if (idFiles.containsKey(fileId)) {
if (!idFiles.get(fileId).equals(fileName)) {
throw new StorageEngineException("File " + fileName + " (" + fileId + ") "
+ "has a different fileName in the StudyConfiguration: "
+ idFiles.get(fileId) + " (" + fileId + ")");
}
}
if (studyConfiguration.getIndexedFiles().contains(fileId)) {
throw StorageEngineException.alreadyLoaded(fileId, fileName);
}
return fileId;
}
/**
* Check if the StudyConfiguration is correct.
*
* @param studyConfiguration StudyConfiguration to check
* @throws StorageEngineException If object is null
*/
public static void checkStudyConfiguration(StudyConfiguration studyConfiguration) throws StorageEngineException {
if (studyConfiguration == null) {
throw new StorageEngineException("StudyConfiguration is null");
}
checkStudyId(studyConfiguration.getStudyId());
if (studyConfiguration.getFileIds().size() != StudyConfiguration.inverseMap(studyConfiguration.getFileIds()).size()) {
throw new StorageEngineException("StudyConfiguration has duplicated fileIds");
}
if (studyConfiguration.getCohortIds().size() != StudyConfiguration.inverseMap(studyConfiguration.getCohortIds()).size()) {
throw new StorageEngineException("StudyConfiguration has duplicated cohortIds");
}
}
public static void checkStudyId(int studyId) throws StorageEngineException {
if (studyId < 0) {
throw new StorageEngineException("Invalid studyId : " + studyId);
}
}
public Thread newShutdownHook(String jobOperationName, List<Integer> files) {
return new Thread(() -> {
try {
logger.error("Shutdown hook!");
setStatus(BatchFileOperation.Status.ERROR, jobOperationName, files);
} catch (StorageEngineException e) {
e.printStackTrace();
throw new RuntimeException(e);
}
});
}
public void setStatus(BatchFileOperation.Status status, String operationName, List<Integer> files) throws StorageEngineException {
int studyId = getStudyId();
long lock = dbAdaptor.getStudyConfigurationManager().lockStudy(studyId);
try {
StudyConfiguration studyConfiguration = getStudyConfiguration(true);
secureSetStatus(studyConfiguration, status, operationName, files);
dbAdaptor.getStudyConfigurationManager().updateStudyConfiguration(studyConfiguration, null);
} finally {
dbAdaptor.getStudyConfigurationManager().unLockStudy(studyId, lock);
}
}
public BatchFileOperation.Status secureSetStatus(StudyConfiguration studyConfiguration, BatchFileOperation.Status status,
String operationName, List<Integer> files)
throws StorageEngineException {
List<BatchFileOperation> batches = studyConfiguration.getBatches();
BatchFileOperation operation = null;
for (int i = batches.size() - 1; i >= 0; i--) {
operation = batches.get(i);
if (operation.getOperationName().equals(operationName) && operation.getFileIds().equals(files)) {
break;
}
operation = null;
}
if (operation == null) {
throw new IllegalStateException("Batch operation " + operationName + " for files " + files + " not found!");
}
BatchFileOperation.Status previousStatus = operation.currentStatus();
operation.addStatus(Calendar.getInstance().getTime(), status);
return previousStatus;
}
public VariantDBAdaptor getDBAdaptor() {
return dbAdaptor;
}
protected int getStudyId() {
return options.getInt(Options.STUDY_ID.key());
}
public ObjectMap getOptions() {
return options;
}
}