/*
 * Copyright 2015-2016 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.opencga.storage.core.variant;

import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.models.variant.VariantStudy;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.opencga.core.common.TimeUtils;
import org.opencb.opencga.storage.core.StorageEngine;
import org.opencb.opencga.storage.core.StoragePipelineResult;
import org.opencb.opencga.storage.core.config.StorageConfiguration;
import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
import org.opencb.opencga.storage.core.exceptions.StoragePipelineException;
import org.opencb.opencga.storage.core.exceptions.VariantSearchException;
import org.opencb.opencga.storage.core.metadata.ExportMetadata;
import org.opencb.opencga.storage.core.metadata.FileStudyConfigurationManager;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.metadata.StudyConfigurationManager;
import org.opencb.opencga.storage.core.search.VariantSearchManager;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBIterator;
import org.opencb.opencga.storage.core.variant.adaptors.VariantField;
import org.opencb.opencga.storage.core.variant.annotation.DefaultVariantAnnotationManager;
import org.opencb.opencga.storage.core.variant.annotation.VariantAnnotationManager;
import org.opencb.opencga.storage.core.variant.annotation.VariantAnnotatorException;
import org.opencb.opencga.storage.core.variant.annotation.annotators.VariantAnnotator;
import org.opencb.opencga.storage.core.variant.annotation.annotators.VariantAnnotatorFactory;
import org.opencb.opencga.storage.core.variant.io.VariantExporter;
import org.opencb.opencga.storage.core.variant.io.VariantImporter;
import org.opencb.opencga.storage.core.variant.io.VariantReaderUtils;
import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory.VariantOutputFormat;
import org.opencb.opencga.storage.core.variant.stats.DefaultVariantStatisticsManager;
import org.opencb.opencga.storage.core.variant.stats.VariantStatisticsManager;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URI;
import java.nio.file.Path;
import java.util.*;

/**
 * Base class for the variant storage engines.
 * <p>
 * Groups the high level operations over a variants database: import, export, index
 * (with optional annotation and statistics calculation after loading), annotation,
 * statistics and search indexing. Backend specific pieces are supplied by subclasses
 * through the abstract methods and the overridable {@code new...} factory methods.
 *
 * Created by imedina on 13/08/14.
 */
public abstract class VariantStorageEngine extends StorageEngine<VariantDBAdaptor> {

    protected VariantSearchManager variantSearchManager;

    /**
     * Configuration options understood by the variant storage engines.
     * Each option has a configuration key and a default value.
     */
    public enum Options {
        INCLUDE_STATS("include.stats", true),              //Include existing stats on the original file.
        // @Deprecated
        // INCLUDE_GENOTYPES("include.genotypes", true),      //Include sample information (genotypes)
        EXTRA_GENOTYPE_FIELDS("include.extra-fields", ""),  //Include other sample information (like DP, GQ, ...)
        EXTRA_GENOTYPE_FIELDS_TYPE("include.extra-fields-format", ""),  //Other sample information format (String, Integer, Float)
        EXTRA_GENOTYPE_FIELDS_COMPRESS("extra-fields.compress", true),    //Compress with gzip other sample information
        // @Deprecated
        // INCLUDE_SRC("include.src", false),                  //Include original source file on the transformed file and the final db
        // COMPRESS_GENOTYPES ("compressGenotypes", true),    //Stores sample information as compressed genotypes
        EXCLUDE_GENOTYPES("exclude.genotypes", false),      //Do not store genotypes from samples

        STUDY_CONFIGURATION("studyConfiguration", ""),
        STUDY_CONFIGURATION_MANAGER_CLASS_NAME("studyConfigurationManagerClassName", ""),

        STUDY_TYPE("studyType", VariantStudy.StudyType.CASE_CONTROL),
        AGGREGATED_TYPE("aggregatedType", VariantSource.Aggregation.NONE),
        STUDY_NAME("studyName", "default"),
        STUDY_ID("studyId", -1),
        FILE_ID("fileId", -1),
        OVERRIDE_FILE_ID("overrideFileId", false),
        SAMPLE_IDS("sampleIds", ""),
        GVCF("gvcf", false),
        ISOLATE_FILE_FROM_STUDY_CONFIGURATION("isolateStudyConfiguration", false),
        TRANSFORM_FAIL_ON_MALFORMED_VARIANT("transform.fail.on.malformed", false),

        COMPRESS_METHOD("compressMethod", "gzip"),
        AGGREGATION_MAPPING_PROPERTIES("aggregationMappingFile", null),
        DB_NAME("database.name", "opencga"),

        TRANSFORM_BATCH_SIZE("transform.batch.size", 200),
        TRANSFORM_THREADS("transform.threads", 4),
        TRANSFORM_FORMAT("transform.format", "avro"),
        LOAD_BATCH_SIZE("load.batch.size", 100),
        LOAD_THREADS("load.threads", 6),

        CALCULATE_STATS("calculateStats", false),          //Calculate stats on the postLoad step
        OVERWRITE_STATS("overwriteStats", false),          //Overwrite stats already present
        UPDATE_STATS("updateStats", false),                //Calculate missing stats
        ANNOTATE("annotate", false),

        RESUME("resume", false),

        DEFAULT_TIMEOUT("dbadaptor.default_timeout", 10000), // Default timeout for DBAdaptor operations. Only used if none is provided.
        MAX_TIMEOUT("dbadaptor.max_timeout", 30000);         // Max allowed timeout for DBAdaptor operations

        private final String key;
        private final Object value;

        Options(String key, Object value) {
            this.key = key;
            this.value = value;
        }

        /**
         * @return Configuration key of this option
         */
        public String key() {
            return key;
        }

        /**
         * @param <T> Expected type of the default value. The cast is unchecked, so the caller must ask for the right type.
         * @return Default value of this option. May be null.
         */
        @SuppressWarnings("unchecked")
        public <T> T defaultValue() {
            return (T) value;
        }
    }

    /**
     * Creates an engine without configuration. Only the logger is initialized;
     * {@link #variantSearchManager} is left unset.
     */
    @Deprecated
    public VariantStorageEngine() {
        logger = LoggerFactory.getLogger(VariantStorageEngine.class);
    }

    /**
     * Creates an engine using the default storage engine id of the given configuration.
     *
     * @param configuration Storage configuration
     */
    public VariantStorageEngine(StorageConfiguration configuration) {
        this(configuration.getDefaultStorageEngineId(), configuration);
    }

    /**
     * Creates an engine for the given storage engine id.
     *
     * @param storageEngineId Storage engine id
     * @param configuration   Storage configuration
     */
    public VariantStorageEngine(String storageEngineId, StorageConfiguration configuration) {
        super(storageEngineId, configuration);
        // No DBAdaptor is available yet. The manager is rebuilt with one in searchIndex.
        variantSearchManager = new VariantSearchManager(null, configuration);
        logger = LoggerFactory.getLogger(VariantStorageEngine.class);
    }

    /**
     * Loads the given file into an empty database.
     *
     * The input file should have, in the same directory, a metadata file, with the same name ended with
     * {@link VariantExporter#METADATA_FILE_EXTENSION}
     *
     * @param inputFile     Variants input file in avro format.
     * @param dbName        Database name where to load the variants
     * @param options       Other options. Not read by this implementation.
     * @throws IOException      if there is any I/O error
     * @throws StorageEngineException  if there is any error loading the variants
     */
    public void importData(URI inputFile, String dbName, ObjectMap options) throws StorageEngineException, IOException {
        try (VariantDBAdaptor dbAdaptor = getDBAdaptor(dbName)) {
            VariantImporter variantImporter = newVariantImporter(dbAdaptor);
            variantImporter.importData(inputFile);
        }
    }

    /**
     * Loads the given file into an empty database.
     *
     * @param inputFile     Variants input file in avro format.
     * @param metadata      Metadata related with the data to be loaded.
     * @param studiesOldNewMap  Map from old to new StudyConfiguration, in case of name remapping
     * @param dbName        Database name where to load the variants
     * @param options       Other options. Not read by this implementation.
     * @throws IOException      if there is any I/O error
     * @throws StorageEngineException  if there is any error loading the variants
     */
    public void importData(URI inputFile, ExportMetadata metadata, Map<StudyConfiguration, StudyConfiguration> studiesOldNewMap,
                           String dbName, ObjectMap options)
            throws StorageEngineException, IOException {
        try (VariantDBAdaptor dbAdaptor = getDBAdaptor(dbName)) {
            VariantImporter variantImporter = newVariantImporter(dbAdaptor);
            variantImporter.importData(inputFile, metadata, studiesOldNewMap);
        }
    }

    /**
     * Creates a new {@link VariantImporter} for the current backend.
     *
     * There is no default VariantImporter.
     *
     * @param dbAdaptor     DBAdaptor to the current database
     * @return              new VariantImporter
     * @throws UnsupportedOperationException always, unless overridden by the subclass
     */
    protected VariantImporter newVariantImporter(VariantDBAdaptor dbAdaptor) {
        throw new UnsupportedOperationException();
    }

    /**
     * Exports the result of the given query and the associated metadata.
     *
     * @param outputFile    Optional output file. If null or empty, will print into the Standard output. Won't export any metadata.
     * @param outputFormat  Variant output format
     * @param dbName        DBName for reading the variants
     * @param query         Query with the variants to export
     * @param queryOptions  Query options
     * @throws IOException  If there is any IO error
     * @throws StorageEngineException  If there is any error exporting variants
     */
    public void exportData(URI outputFile, VariantOutputFormat outputFormat, String dbName, Query query, QueryOptions queryOptions)
            throws IOException, StorageEngineException {
        try (VariantDBAdaptor dbAdaptor = getDBAdaptor(dbName)) {
            VariantExporter exporter = newVariantExporter(dbAdaptor);
            exporter.export(outputFile, outputFormat, query, queryOptions);
        }
    }

    /**
     * Creates a new {@link VariantExporter} for the current backend.
     * The default implementation iterates locally through the database.
     *
     * @param dbAdaptor     DBAdaptor to the current database
     * @return              new VariantExporter
     */
    protected VariantExporter newVariantExporter(VariantDBAdaptor dbAdaptor) {
        return new VariantExporter(dbAdaptor);
    }

    /**
     * Index the given input files. By default, executes the steps in {@link VariantStoragePipeline}.
     *
     * Will create a {@link #newStoragePipeline} for each input file.
     *
     * When {@code doLoad} is true, the loaded files are afterwards annotated and their statistics
     * calculated, each step only if enabled through {@link Options#ANNOTATE} and
     * {@link Options#CALCULATE_STATS} respectively.
     *
     * @param inputFiles    Input files to index
     * @param outdirUri     Output directory for possible intermediate files
     * @param doExtract     Execute extract step {@link VariantStoragePipeline#extract}
     * @param doTransform   Execute transform step {@link VariantStoragePipeline#transform}
     * @param doLoad        Execute load step {@link VariantStoragePipeline#load}
     * @return              List of {@link StoragePipelineResult}, one for each input file.
     * @throws StorageEngineException      If there is any problem related with the StorageEngine
     */
    @Override
    public List<StoragePipelineResult> index(List<URI> inputFiles, URI outdirUri, boolean doExtract, boolean doTransform, boolean doLoad)
            throws StorageEngineException {
        List<StoragePipelineResult> results = super.index(inputFiles, outdirUri, doExtract, doTransform, doLoad);
        if (doLoad) {
            annotateLoadedFiles(outdirUri, inputFiles, results, getOptions());
            calculateStatsForLoadedFiles(outdirUri, inputFiles, results, getOptions());
        }
        return results;
    }

    @Override
    public abstract VariantStoragePipeline newStoragePipeline(boolean connected) throws StorageEngineException;

    /**
     * Given a dbName, calculates the annotation for all the variants that matches with a given query, and loads them into the database.
     *
     * @param dbName    database name to annotate.
     * @param query     Query to select variants to annotate
     * @param params    Other params
     * @throws VariantAnnotatorException    If the annotation goes wrong
     * @throws StorageEngineException       If there is any problem related with the StorageEngine
     * @throws IOException                  If there is any IO problem
     */
    public void annotate(String dbName, Query query, ObjectMap params)
            throws VariantAnnotatorException, StorageEngineException, IOException {
        VariantAnnotator annotator = VariantAnnotatorFactory.buildVariantAnnotator(configuration, getStorageEngineId(), params);
        try (VariantDBAdaptor dbAdaptor = getDBAdaptor(dbName)) {
            VariantAnnotationManager annotationManager = newVariantAnnotationManager(annotator, dbAdaptor);
            annotationManager.annotate(query, params);
        }
    }

    /**
     * Annotate loaded files. Used only to annotate recently loaded files, after the {@link #index}.
     *
     * Does nothing if there are no files or {@link Options#ANNOTATE} is not enabled.
     *
     * @param outdirUri     Index output directory
     * @param files         Indexed files
     * @param results       StorageETLResults
     * @param options       Other options
     * @throws StoragePipelineException  If there is any problem related with the StorageEngine
     */
    protected void annotateLoadedFiles(URI outdirUri, List<URI> files, List<StoragePipelineResult> results, ObjectMap options)
            throws StoragePipelineException {
        if (!files.isEmpty() && options.getBoolean(Options.ANNOTATE.key(), Options.ANNOTATE.defaultValue())) {

            String dbName = options.getString(Options.DB_NAME.key(), null);
            try (VariantDBAdaptor dbAdaptor = getDBAdaptor(dbName)) {

                int studyId = options.getInt(Options.STUDY_ID.key());
                StudyConfiguration studyConfiguration = dbAdaptor.getStudyConfigurationManager()
                        .getStudyConfiguration(studyId, new QueryOptions(options)).first();

                // Translate the loaded file URIs into the file ids registered in the study
                List<Integer> fileIds = new ArrayList<>();
                for (URI uri : files) {
                    String fileName = VariantReaderUtils.getOriginalFromTransformedFile(uri);
                    fileIds.add(studyConfiguration.getFileIds().get(fileName));
                }

                Query annotationQuery = new Query();
                if (!options.getBoolean(VariantAnnotationManager.OVERWRITE_ANNOTATIONS, false)) {
                    // Skip the variants that already have an annotation
                    annotationQuery.put(VariantDBAdaptor.VariantQueryParams.ANNOTATION_EXISTS.key(), false);
                }
                annotationQuery.put(VariantDBAdaptor.VariantQueryParams.STUDIES.key(),
                        Collections.singletonList(studyId));
                // annotate just the indexed variants
                annotationQuery.put(VariantDBAdaptor.VariantQueryParams.FILES.key(), fileIds);

                QueryOptions annotationOptions = new QueryOptions()
                        .append(DefaultVariantAnnotationManager.OUT_DIR, outdirUri.getPath())
                        .append(DefaultVariantAnnotationManager.FILE_NAME, dbName + "." + TimeUtils.getTime());

                annotate(dbName, annotationQuery, annotationOptions);
            } catch (RuntimeException | StorageEngineException | VariantAnnotatorException | IOException e) {
                throw new StoragePipelineException("Error annotating.", e, results);
            }
        }
    }

    /**
     * Provide a new VariantAnnotationManager for creating and loading annotations.
     *
     * @param annotator     VariantAnnotator to use for creating the new annotations
     * @param dbAdaptor     VariantDBAdaptor
     * @return              A new instance of VariantAnnotationManager
     */
    protected VariantAnnotationManager newVariantAnnotationManager(VariantAnnotator annotator, VariantDBAdaptor dbAdaptor) {
        return new DefaultVariantAnnotationManager(annotator, dbAdaptor);
    }

    /**
     * Calculates the statistics of the given cohorts of a study, using the
     * {@link VariantStatisticsManager} provided by {@link #newVariantStatisticsManager}.
     *
     * @param study     Study
     * @param cohorts   Cohorts to calculate stats
     * @param dbName    Name of the database holding the variants
     * @param options   Other options
     *                  {@link Options#AGGREGATION_MAPPING_PROPERTIES}
     *                  {@link Options#OVERWRITE_STATS}
     *                  {@link Options#UPDATE_STATS}
     *                  {@link Options#LOAD_THREADS}
     *                  {@link Options#LOAD_BATCH_SIZE}
     *                  {@link VariantDBAdaptor.VariantQueryParams#REGION}
     *
     * @throws StorageEngineException      If there is any problem related with the StorageEngine
     * @throws IOException                  If there is any IO problem
     */
    public void calculateStats(String study, List<String> cohorts, String dbName, QueryOptions options)
            throws StorageEngineException, IOException {
        try (VariantDBAdaptor dbAdaptor = getDBAdaptor(dbName)) {
            VariantStatisticsManager statisticsManager = newVariantStatisticsManager(dbAdaptor);
            statisticsManager.calculateStatistics(study, cohorts, options);
        }
    }

    /**
     * Calculate stats for loaded files. Used to calculate statistics for cohort ALL from recently loaded files, after the {@link #index}.
     *
     * Does nothing if there are no files or {@link Options#CALCULATE_STATS} is not enabled.
     *
     * @param output        Index output directory
     * @param files         Indexed files
     * @param results       StorageETLResults
     * @param options       Other options
     * @throws StoragePipelineException  If there is any problem related with the StorageEngine
     */
    protected void calculateStatsForLoadedFiles(URI output, List<URI> files, List<StoragePipelineResult> results, ObjectMap options)
            throws StoragePipelineException {
        // Same guard as annotateLoadedFiles: with no files there is no first file id to name the stats output after.
        if (!files.isEmpty() && options.getBoolean(Options.CALCULATE_STATS.key(), Options.CALCULATE_STATS.defaultValue())) {
            // TODO add filters
            String dbName = options.getString(Options.DB_NAME.key(), null);
            try (VariantDBAdaptor dbAdaptor = getDBAdaptor(dbName)) {
                logger.debug("about to calculate stats");

                int studyId = options.getInt(Options.STUDY_ID.key());
                QueryOptions statsOptions = new QueryOptions(options);
                StudyConfiguration studyConfiguration = dbAdaptor.getStudyConfigurationManager()
                        .getStudyConfiguration(studyId, new QueryOptions()).first();

                // Translate the loaded file URIs into the file ids registered in the study
                List<Integer> fileIds = new ArrayList<>();
                for (URI uri : files) {
                    String fileName = VariantReaderUtils.getOriginalFromTransformedFile(uri);
                    fileIds.add(studyConfiguration.getFileIds().get(fileName));
                }

                Integer defaultCohortId = studyConfiguration.getCohortIds().get(StudyEntry.DEFAULT_COHORT);
                if (studyConfiguration.getCalculatedStats().contains(defaultCohortId)) {
                    logger.debug("Cohort \"{}\":{} was already calculated. Just update stats.", StudyEntry.DEFAULT_COHORT, defaultCohortId);
                    statsOptions.append(Options.UPDATE_STATS.key(), true);
                }

                // The stats output is named after the study and the first loaded file, plus a timestamp
                URI statsOutputUri = output.resolve(buildFilename(studyConfiguration.getStudyName(), fileIds.get(0))
                        + "." + TimeUtils.getTime());
                statsOptions.put(DefaultVariantStatisticsManager.OUTPUT, statsOutputUri.toString());
                statsOptions.remove(Options.FILE_ID.key());

                List<String> cohorts = Collections.singletonList(StudyEntry.DEFAULT_COHORT);
                calculateStats(studyConfiguration.getStudyName(), cohorts, dbName, statsOptions);
            } catch (Exception e) {
                throw new StoragePipelineException("Can't calculate stats.", e, results);
            }
        }
    }

    /**
     * Provide a new VariantStatisticsManager for calculating and loading statistics.
     *
     * @param dbAdaptor     VariantDBAdaptor
     * @return              A new instance of VariantStatisticsManager
     */
    public VariantStatisticsManager newVariantStatisticsManager(VariantDBAdaptor dbAdaptor) {
        return new DefaultVariantStatisticsManager(dbAdaptor);
    }

    /**
     * Loads all the variants of the given database into the search engine.
     *
     * @param database  Database name. Also used as the name of the search collection
     * @throws StorageEngineException   If there is any problem related with the StorageEngine
     * @throws IOException              If there is any IO problem
     * @throws VariantSearchException   If there is any problem related with the search engine
     */
    public void searchIndex(String database) throws StorageEngineException, IOException, VariantSearchException {
        searchIndex(database, new Query(), new QueryOptions());
    }

    /**
     * Loads the variants matching the query into the search engine.
     *
     * Nothing is loaded unless the search is active in the configuration and the search engine is alive.
     * The collection, named as the database, is created if it does not exist. Replaces
     * {@link #variantSearchManager} with a new manager bound to the opened DBAdaptor.
     *
     * @param database      Database name. Also used as the name of the search collection
     * @param query         Query to select the variants to load
     * @param queryOptions  Query options. Can be null. The {@link QueryOptions#EXCLUDE} entry is always
     *                      overwritten to skip samples data and files. The given object is not modified.
     * @throws StorageEngineException   If there is any problem related with the StorageEngine
     * @throws IOException              If there is any IO problem
     * @throws VariantSearchException   If there is any problem related with the search engine
     */
    public void searchIndex(String database, Query query, QueryOptions queryOptions)
            throws StorageEngineException, IOException, VariantSearchException {
        // try-with-resources: the DBAdaptor has to be closed also when the load fails
        try (VariantDBAdaptor dbAdaptor = getDBAdaptor(database)) {
            variantSearchManager = new VariantSearchManager(dbAdaptor.getDBAdaptorUtils(), configuration);

            if (configuration.getSearch().getActive() && variantSearchManager.isAlive(database)) {
                // first, create the collection if it does not exist
                if (!variantSearchManager.existCollection(database)) {
                    // by default: config=OpenCGAConfSet, shards=1, replicas=1
                    logger.info("Creating Solr collection {}", database);
                    variantSearchManager.createCollection(database);
                } else {
                    logger.info("Solr collection '{}' exists.", database);
                }

                // then, load variants. Work on a copy, so the options given by the caller are honoured but not modified
                QueryOptions loadOptions = queryOptions == null ? new QueryOptions() : new QueryOptions(queryOptions);
                loadOptions.put(QueryOptions.EXCLUDE, Arrays.asList(VariantField.STUDIES_SAMPLES_DATA, VariantField.STUDIES_FILES));
                VariantDBIterator iterator = dbAdaptor.iterator(query, loadOptions);
                variantSearchManager.load(database, iterator);
            }
        }
    }

    /**
     * Drops a file from the Variant Storage.
     *
     * @param study  StudyName or StudyId
     * @param fileId FileId
     * @throws StorageEngineException If the file can not be deleted or there was some problem deleting it.
     */
    public abstract void dropFile(String study, int fileId) throws StorageEngineException;

    /**
     * Drops a study from the Variant Storage.
     *
     * @param studyName StudyName
     * @throws StorageEngineException If there is any problem related with the StorageEngine
     */
    public abstract void dropStudy(String studyName) throws StorageEngineException;

    /**
     * No-op in this class: the connection check is commented out.
     *
     * @throws StorageEngineException Never thrown by this implementation
     */
    @Override
    public void testConnection() throws StorageEngineException {
//        ObjectMap variantOptions = configuration.getStorageEngine(storageEngineId).getVariant().getOptions();
//        logger.error("Connection to database '{}' failed", variantOptions.getString(VariantStorageEngine.Options.DB_NAME.key()));
//        throw new StorageEngineException("Database connection test failed");
    }

    /**
     * @return Variant options configured for this storage engine
     */
    public ObjectMap getOptions() {
        return configuration.getStorageEngine(storageEngineId).getVariant().getOptions();
    }

    /**
     * @return A new instance of VariantReaderUtils
     */
    public VariantReaderUtils getVariantReaderUtils() {
        return new VariantReaderUtils();
    }

    /**
     * Get the StudyConfigurationManager.
     * <p>
     * If a class name is given, either in the options ({@link Options#STUDY_CONFIGURATION_MANAGER_CLASS_NAME})
     * or in the configuration, try to build it by dependency injection.
     * Otherwise, call to the method "buildStudyConfigurationManager", which could be overridden.
     *
     * @param options   Map-like object with the options
     * @return          A StudyConfigurationManager object
     * @throws StorageEngineException   If the default StudyConfigurationManager can not be built
     * @throws RuntimeException         If the class name was given, but it can not be instantiated
     */
    public final StudyConfigurationManager getStudyConfigurationManager(ObjectMap options) throws StorageEngineException {
        StudyConfigurationManager studyConfigurationManager = null;
        String studyConfigurationManagerClassName = null;
        // The class name given in the options takes precedence over the one in the configuration
        if (options.containsKey(Options.STUDY_CONFIGURATION_MANAGER_CLASS_NAME.key())) {
            studyConfigurationManagerClassName = options.getString(Options.STUDY_CONFIGURATION_MANAGER_CLASS_NAME.key());
        } else {
            if (configuration.getStudyMetadataManager() != null && !configuration.getStudyMetadataManager().isEmpty()) {
                studyConfigurationManagerClassName = configuration.getStudyMetadataManager();
            }
        }

        if (studyConfigurationManagerClassName != null && !studyConfigurationManagerClassName.isEmpty()) {
            try {
                studyConfigurationManager = StudyConfigurationManager.build(studyConfigurationManagerClassName, options);
            } catch (ReflectiveOperationException e) {
                // There is no fall back to the default manager: an explicitly requested class must be usable
                logger.error("Error creating a StudyConfigurationManager of class '{}'", studyConfigurationManagerClassName, e);
                throw new RuntimeException("Error creating a StudyConfigurationManager of class '"
                        + studyConfigurationManagerClassName + "'", e);
            }
        }
        // This method can be overridden in children classes
        if (studyConfigurationManager == null) {
            studyConfigurationManager = buildStudyConfigurationManager(options);
        }
        return studyConfigurationManager;
    }

    /**
     * Build the default StudyConfigurationManager. This method could be overridden by children classes if they want to use other class.
     *
     * @param options   Map-like object with the options
     * @return          A StudyConfigurationManager object
     * @throws StorageEngineException   Not thrown by this implementation. Declared for the overriding classes
     */
    protected StudyConfigurationManager buildStudyConfigurationManager(ObjectMap options) throws StorageEngineException {
        return new FileStudyConfigurationManager(options);
    }

    /**
     * @param input Input variant file (avro, json, vcf)
     * @param source VariantSource to fill. Can be null
     * @return Read VariantSource
     * @throws StorageEngineException if the format is not valid or there is an error reading
     * @deprecated use {@link VariantReaderUtils#readVariantSource(java.net.URI)}
     */
    @Deprecated
    public static VariantSource readVariantSource(Path input, VariantSource source) throws StorageEngineException {
        return VariantReaderUtils.readVariantSource(input, source);
    }

    /**
     * Builds a file name from the study and the file id. Delegates to {@link VariantStoragePipeline#buildFilename}.
     *
     * @param studyName Study name
     * @param fileId    File id
     * @return          File name
     */
    public static String buildFilename(String studyName, int fileId) {
        return VariantStoragePipeline.buildFilename(studyName, fileId);
    }

    /**
     * @return Current VariantSearchManager
     */
    public VariantSearchManager getVariantSearchManager() {
        return variantSearchManager;
    }

    /**
     * @param variantSearchManager VariantSearchManager to use
     * @return This same instance
     */
    public VariantStorageEngine setVariantSearchManager(VariantSearchManager variantSearchManager) {
        this.variantSearchManager = variantSearchManager;
        return this;
    }
}