/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.opencga.storage.core.manager.variant.operations; import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.models.variant.StudyEntry; import org.opencb.biodata.models.variant.VariantSource.Aggregation; import org.opencb.biodata.tools.variant.stats.VariantAggregatedStatsCalculator; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.datastore.core.QueryResult; import org.opencb.commons.utils.FileUtils; import org.opencb.opencga.catalog.db.api.StudyDBAdaptor; import org.opencb.opencga.catalog.exceptions.CatalogException; import org.opencb.opencga.catalog.managers.CatalogManager; import org.opencb.opencga.catalog.models.*; import org.opencb.opencga.core.common.UriUtils; import org.opencb.opencga.storage.core.StorageEngineFactory; import org.opencb.opencga.storage.core.config.StorageConfiguration; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.metadata.StudyConfiguration; import org.opencb.opencga.storage.core.variant.VariantStorageEngine; import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor; import org.opencb.opencga.storage.core.variant.stats.DefaultVariantStatisticsManager; import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.net.URISyntaxException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; import java.util.stream.Collectors; import static org.apache.commons.lang3.StringUtils.isBlank; import static org.apache.commons.lang3.StringUtils.isNotEmpty; import static org.opencb.opencga.storage.core.variant.VariantStorageEngine.Options; /** * Created by jacobo on 06/03/15. */ public class VariantStatsStorageOperation extends StorageOperation { public VariantStatsStorageOperation(CatalogManager catalogManager, StorageConfiguration storageConfiguration) { super(catalogManager, StorageEngineFactory.get(storageConfiguration), LoggerFactory.getLogger(VariantStatsStorageOperation.class)); } public void calculateStats(long studyId, List<String> cohorts, String outdirStr, QueryOptions options, String sessionId) throws CatalogException, IOException, URISyntaxException, StorageEngineException { Job.Type step = Job.Type.COHORT_STATS; String fileIdStr = options.getString(Options.FILE_ID.key(), null); boolean overwriteStats = options.getBoolean(Options.OVERWRITE_STATS.key(), false); boolean updateStats = options.getBoolean(Options.UPDATE_STATS.key(), false); boolean resume = options.getBoolean(Options.RESUME.key(), Options.RESUME.defaultValue()); final Long fileId = fileIdStr == null ? null : catalogManager.getFileId(fileIdStr, Long.toString(studyId), sessionId); // Outdir must be empty URI outdirUri = UriUtils.createDirectoryUri(outdirStr); final Path outdir = Paths.get(outdirUri); outdirMustBeEmpty(outdir, options); Aggregation aggregation = getAggregation(studyId, options, sessionId); List<Long> cohortIds = checkCohorts(studyId, aggregation, cohorts, options, sessionId); Map<Long, Cohort> cohortsMap = checkCanCalculateCohorts(studyId, cohortIds, updateStats, resume, sessionId); String region = options.getString(VariantDBAdaptor.VariantQueryParams.REGION.key()); String outputFileName = buildOutputFileName(cohortIds, options, cohortsMap, region); Long catalogOutDirId = getCatalogOutdirId(studyId, options, sessionId); QueryOptions calculateStatsOptions = new QueryOptions(options) // .append(VariantStorageEngine.Options.LOAD_BATCH_SIZE.key(), 100) // .append(VariantStorageEngine.Options.LOAD_THREADS.key(), 6) .append(Options.OVERWRITE_STATS.key(), overwriteStats) .append(Options.UPDATE_STATS.key(), updateStats) .append(Options.RESUME.key(), resume); calculateStatsOptions.putIfNotNull(Options.FILE_ID.key(), fileId); calculateStatsOptions.putIfNotEmpty(VariantDBAdaptor.VariantQueryParams.REGION.key(), region); // if the study is aggregated and a mapping file is provided, pass it to storage // and create in catalog the cohorts described in the mapping file String aggregationMappingFile = options.getString(Options.AGGREGATION_MAPPING_PROPERTIES.key()); if (Aggregation.isAggregated(aggregation) && StringUtils.isNotEmpty(aggregationMappingFile)) { Properties mappingFile = readAggregationMappingFile(aggregationMappingFile); calculateStatsOptions.append(Options.AGGREGATION_MAPPING_PROPERTIES.key(), mappingFile); } DataStore dataStore = StorageOperation.getDataStore(catalogManager, studyId, File.Bioformat.VARIANT, sessionId); StudyConfiguration studyConfiguration = updateStudyConfiguration(sessionId, studyId, dataStore); Thread hook = buildHook(cohortIds, sessionId, outdir); writeJobStatus(outdir, new Job.JobStatus(Job.JobStatus.RUNNING, "Job has just started")); Runtime.getRuntime().addShutdownHook(hook); // Up to this point, catalog has not been modified try { // Modify cohort status to "CALCULATING" updateCohorts(cohortIds, sessionId, Cohort.CohortStatus.CALCULATING, "Start calculating stats"); calculateStatsOptions.put(DefaultVariantStatisticsManager.OUTPUT, outdirUri.resolve(outputFileName)); VariantStorageEngine variantStorageEngine = storageEngineFactory.getVariantStorageEngine(dataStore.getStorageEngine()); List<String> cohortsName = cohortsMap.values().stream().map(Cohort::getName).collect(Collectors.toList()); variantStorageEngine.calculateStats(studyConfiguration.getStudyName(), cohortsName, dataStore.getDbName(), calculateStatsOptions); // DefaultVariantStatisticsManager variantStatisticsManager = new DefaultVariantStatisticsManager(dbAdaptor); // // VariantDBAdaptor dbAdaptor = variantStorageManager.getDBAdaptor(dataStore.getDbName()); // Map<String, Integer> cohortNameIdMap = new HashMap<>(cohortIds.size()); // Map<String, Set<String>> cohortSamplesMap = new HashMap<>(cohortIds.size()); // for (Map.Entry<Long, Cohort> entry : cohortsMap.entrySet()) { // cohortNameIdMap.put(entry.getValue().getName(), entry.getKey().intValue()); // cohortSamplesMap.put(entry.getValue().getName(), entry.getValue().getSamples() // .stream() // .map(sampleId -> { // return studyConfiguration.getSampleIds().inverse().get(sampleId.intValue()); // }) // .collect(Collectors.toSet())); // } // URI stats = variantStatisticsManager.createStats(dbAdaptor, outdirUri.resolve(outputFileName), cohortSamplesMap, // cohortNameIdMap, studyConfiguration, calculateStatsOptions); // // writeJobStatus(outdir, new Job.JobStatus(Job.JobStatus.RUNNING, "Job still running. Statistics created.")); // variantStatisticsManager.loadStats(dbAdaptor, stats, studyConfiguration, options); if (catalogOutDirId != null) { copyResults(Paths.get(outdirUri), catalogOutDirId, sessionId); } writeJobStatus(outdir, new Job.JobStatus(Job.JobStatus.DONE, "Job completed")); // Modify cohort status to "READY" updateCohorts(cohortIds, sessionId, Cohort.CohortStatus.READY, ""); } catch (Exception e) { // Error! logger.error("Error executing stats. Set cohorts status to " + Cohort.CohortStatus.INVALID, e); writeJobStatus(outdir, new Job.JobStatus(Job.JobStatus.ERROR, "Job with error : " + e.getMessage())); // Modify to "INVALID" updateCohorts(cohortIds, sessionId, Cohort.CohortStatus.INVALID, "Error calculating stats: " + e.getMessage()); throw new StorageEngineException("Error calculating statistics.", e); } finally { // Remove hook Runtime.getRuntime().removeShutdownHook(hook); } } protected Thread buildHook(List<Long> cohortIds, String sessionId, Path outdir) { return buildHook(outdir, () -> { try { updateCohorts(cohortIds, sessionId, Cohort.CohortStatus.INVALID, ""); } catch (CatalogException e) { logger.error("Error updating cohorts " + cohortIds + " to status " + Cohort.CohortStatus.INVALID, e); } }); } protected String buildOutputFileName(List<Long> cohortIds, QueryOptions options, Map<Long, Cohort> cohortsMap, String region) { final String outputFileName; if (isNotEmpty(options.getString(DefaultVariantStatisticsManager.OUTPUT_FILE_NAME))) { outputFileName = options.getString(DefaultVariantStatisticsManager.OUTPUT_FILE_NAME); } else { StringBuilder outputFileNameBuilder; outputFileNameBuilder = new StringBuilder("stats_"); if (isNotEmpty(region)) { outputFileNameBuilder.append(region).append("_"); } for (Iterator<Long> iterator = cohortIds.iterator(); iterator.hasNext();) { Long cohortId = iterator.next(); outputFileNameBuilder.append(cohortsMap.get(cohortId).getName()); if (iterator.hasNext()) { outputFileNameBuilder.append('_'); } } outputFileName = outputFileNameBuilder.toString(); } return outputFileName; } /** * Must provide a list of cohorts or a aggregation_mapping_properties file. * @param studyId StudyId * @param aggregation Aggregation type for this study. {@link #getAggregation} * @param cohorts List of cohorts * @param options Options, where the aggregation mapping properties file will be * @param sessionId User's sessionId * @return Checked list of cohorts * @throws CatalogException if an error on Catalog * @throws IOException if an IO error reading the aggregation map file (if any) */ protected List<Long> checkCohorts(long studyId, Aggregation aggregation, List<String> cohorts, QueryOptions options, String sessionId) throws CatalogException, IOException { List<Long> cohortIds; String userId = catalogManager.getUserManager().getId(sessionId); // Check aggregation mapping properties String tagMap = options.getString(Options.AGGREGATION_MAPPING_PROPERTIES.key()); List<Long> cohortsByAggregationMapFile = Collections.emptyList(); if (!isBlank(tagMap)) { if (!Aggregation.isAggregated(aggregation)) { throw nonAggregatedWithMappingFile(); } cohortsByAggregationMapFile = createCohortsByAggregationMapFile(studyId, tagMap, sessionId); } else if (Aggregation.isAggregated(aggregation)) { if (aggregation.equals(Aggregation.BASIC)) { cohortsByAggregationMapFile = createCohortsIfNeeded(studyId, Collections.singleton(StudyEntry.DEFAULT_COHORT), sessionId); } else { throw missingAggregationMappingFile(aggregation); } } if (cohorts == null || cohorts.isEmpty()) { // If no aggregation map file provided if (cohortsByAggregationMapFile.isEmpty()) { throw missingCohorts(); } else { cohortIds = cohortsByAggregationMapFile; } } else { cohortIds = new ArrayList<>(cohorts.size()); for (String cohort : cohorts) { if (!cohort.contains(":")) { cohort = studyId + ":" + cohort; } long cohortId = catalogManager.getCohortManager().getId(userId, cohort); if (cohortId < 0) { throw new CatalogException("Cohort '" + cohort + "' not found"); } cohortIds.add(cohortId); } if (!cohortsByAggregationMapFile.isEmpty()) { if (cohortIds.size() != cohortsByAggregationMapFile.size() || !cohortIds.containsAll(cohortsByAggregationMapFile)) { throw differentCohortsThanMappingFile(); } } } return cohortIds; } private List<Long> createCohortsByAggregationMapFile(long studyId, String aggregationMapFile, String sessionId) throws IOException, CatalogException { Properties tagmap = readAggregationMappingFile(aggregationMapFile); Set<String> cohortNames = VariantAggregatedStatsCalculator.getCohorts(tagmap); return createCohortsIfNeeded(studyId, cohortNames, sessionId); } private Properties readAggregationMappingFile(String aggregationMapFile) throws IOException { Properties tagmap = new Properties(); try (InputStream is = FileUtils.newInputStream(Paths.get(aggregationMapFile))) { tagmap.load(is); } return tagmap; } private List<Long> createCohortsIfNeeded(long studyId, Set<String> cohortNames, String sessionId) throws CatalogException { List<Long> cohorts = new ArrayList<>(); Map<String, Long> catalogCohorts = catalogManager.getAllCohorts(studyId, null, new QueryOptions(QueryOptions.INCLUDE, "name,id"), sessionId).getResult() .stream() .collect(Collectors.toMap(Cohort::getName, Cohort::getId)); for (String cohortName : cohortNames) { if (!catalogCohorts.containsKey(cohortName)) { QueryResult<Cohort> cohort = catalogManager .createCohort(studyId, cohortName, Study.Type.COLLECTION, "", Collections.emptyList(), null, sessionId); logger.info("Creating cohort {}", cohortName); cohorts.add(cohort.first().getId()); } else { logger.debug("cohort {} was already created", cohortName); cohorts.add(catalogCohorts.get(cohortName)); } } return cohorts; } /** * If the study is aggregated and a mapping file is provided, pass it to * and create in catalog the cohorts described in the mapping file. * * If the study aggregation was not defined, updateStudy with the provided aggregation type * * @param studyId StudyId where calculate stats * @param options Options * @param sessionId Users sessionId * @return Effective study aggregation type * @throws CatalogException if something is wrong with catalog */ public Aggregation getAggregation(long studyId, QueryOptions options, String sessionId) throws CatalogException { QueryOptions include = new QueryOptions(QueryOptions.INCLUDE, StudyDBAdaptor.QueryParams.ATTRIBUTES.key()); Study study = catalogManager.getStudy(studyId, include, sessionId).first(); Aggregation argsAggregation = options.get(Options.AGGREGATED_TYPE.key(), Aggregation.class, Aggregation.NONE); Object studyAggregationObj = study.getAttributes().get(Options.AGGREGATED_TYPE.key()); Aggregation studyAggregation = null; if (studyAggregationObj != null) { studyAggregation = Aggregation.valueOf(studyAggregationObj.toString()); } final Aggregation aggregation; if (Aggregation.isAggregated(argsAggregation)) { if (studyAggregation != null && !studyAggregation.equals(argsAggregation)) { // FIXME: Throw an exception? logger.warn("Calculating statistics with aggregation " + argsAggregation + " instead of " + studyAggregation); } aggregation = argsAggregation; // If studyAggregation is not define, update study aggregation if (studyAggregation == null) { //update study aggregation Map<String, Aggregation> attributes = Collections.singletonMap(Options.AGGREGATED_TYPE.key(), argsAggregation); ObjectMap parameters = new ObjectMap("attributes", attributes); catalogManager.modifyStudy(studyId, parameters, sessionId); } } else { if (studyAggregation == null) { aggregation = Aggregation.NONE; } else { aggregation = studyAggregation; } } return aggregation; } /** * Check if a set of given cohorts are available to calculate statistics. * * @param studyId Study id * @param cohortIds Set of cohorts * @param updateStats Update already existing stats * @param resume Resume statistics calculation * @param sessionId User's sessionId * @return Map from cohortId to Cohort * @throws CatalogException if an error on Catalog */ protected Map<Long, Cohort> checkCanCalculateCohorts(long studyId, List<Long> cohortIds, boolean updateStats, boolean resume, String sessionId) throws CatalogException { Set<Long> studyIdSet = new HashSet<>(); Map<Long, Cohort> cohortMap = new HashMap<>(cohortIds.size()); for (Long cohortId : cohortIds) { Cohort cohort = catalogManager.getCohort(cohortId, null, sessionId).first(); long studyIdByCohortId = catalogManager.getStudyIdByCohortId(cohortId); studyIdSet.add(studyIdByCohortId); switch (cohort.getStatus().getName()) { case Cohort.CohortStatus.NONE: case Cohort.CohortStatus.INVALID: break; case Cohort.CohortStatus.READY: if (updateStats) { catalogManager.getCohortManager().setStatus(cohortId.toString(), Cohort.CohortStatus.INVALID, "", sessionId); break; } else { // If not updating the stats or resuming, can't calculate statistics for a cohort READY if (!resume) { throw unableToCalculateCohortReady(cohort); } } break; case Cohort.CohortStatus.CALCULATING: if (!resume) { throw unableToCalculateCohortCalculating(cohort); } break; default: throw new IllegalStateException("Unknown status " + cohort.getStatus().getName()); } cohortMap.put(cohortId, cohort); // QueryResult<Sample> sampleQueryResult = catalogManager.getAllSamples(studyIdByCohortId, new Query("id", cohort.getSamples()), // new QueryOptions(), sessionId); } // Check that all cohorts are from the same study if (studyIdSet.size() != 1) { throw new CatalogException("Error: CohortIds are from multiple studies: " + studyIdSet.toString()); } if (!new ArrayList<>(studyIdSet).get(0).equals(studyId)) { throw new CatalogException("Error: CohortIds are from a different study than provided: " + studyIdSet.toString()); } return cohortMap; } protected void updateCohorts(List<Long> cohortIds, String sessionId, String status, String message) throws CatalogException { for (Long cohortId : cohortIds) { catalogManager.getCohortManager().setStatus(cohortId.toString(), status, message, sessionId); } } static CatalogException differentCohortsThanMappingFile() throws CatalogException { return new CatalogException("Given cohorts (if any) must match with cohorts in the aggregation mapping file."); } static CatalogException missingCohorts() throws CatalogException { return new CatalogException("Cohort list null or empty"); } static IllegalArgumentException missingAggregationMappingFile(Aggregation aggregation) { return new IllegalArgumentException("Unable to calculate statistics for an aggregated study of type " + "\"" + aggregation + "\" without an aggregation mapping file."); } static IllegalArgumentException nonAggregatedWithMappingFile() { return new IllegalArgumentException("Unable to use an aggregation mapping file for non aggregated study"); } static CatalogException unableToCalculateCohortReady(Cohort cohort) { return new CatalogException("Unable to calculate stats for cohort " + "{ id: " + cohort.getId() + " name: \"" + cohort.getName() + "\" }" + " with status \"" + cohort.getStatus().getName() + "\". " + "Resume or update stats for continue calculation"); } static CatalogException unableToCalculateCohortCalculating(Cohort cohort) { return new CatalogException("Unable to calculate stats for cohort " + "{ id: " + cohort.getId() + " name: \"" + cohort.getName() + "\" }" + " with status \"" + cohort.getStatus().getName() + "\". " + "Resume for continue calculation."); } }