/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.opencga.storage.core.variant.stats; import com.fasterxml.jackson.core.JsonFactory; import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectWriter; import org.apache.avro.generic.GenericRecord; import org.opencb.biodata.models.variant.StudyEntry; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.stats.VariantSourceStats; import org.opencb.biodata.models.variant.stats.VariantStats; import org.opencb.biodata.tools.variant.stats.VariantAggregatedStatsCalculator; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.io.DataReader; import org.opencb.commons.run.ParallelTaskRunner; import org.opencb.opencga.core.common.ProgressLogger; import org.opencb.opencga.core.common.UriUtils; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.io.plain.StringDataWriter; import org.opencb.opencga.storage.core.metadata.StudyConfiguration; import org.opencb.opencga.storage.core.metadata.StudyConfigurationManager; import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor; import org.opencb.opencga.storage.core.variant.adaptors.VariantField; import org.opencb.opencga.storage.core.variant.io.db.VariantDBReader; import org.opencb.opencga.storage.core.variant.io.db.VariantStatsDBWriter; import org.opencb.opencga.storage.core.io.json.JsonDataReader; import org.opencb.opencga.storage.core.variant.io.json.mixin.GenericRecordAvroJsonMixin; import org.opencb.opencga.storage.core.variant.io.json.mixin.VariantStatsJsonMixin; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.net.URI; import java.net.URISyntaxException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; import java.util.concurrent.ExecutionException; import java.util.stream.Collectors; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; import static org.opencb.biodata.models.variant.VariantSource.Aggregation.isAggregated; import static org.opencb.opencga.storage.core.variant.VariantStoragePipeline.checkStudyConfiguration; import static org.opencb.opencga.storage.core.variant.VariantStorageEngine.Options; /** * Created by jmmut on 12/02/15. */ public class DefaultVariantStatisticsManager implements VariantStatisticsManager{ public static final String OUTPUT_FILE_NAME = "output.file.name"; public static final String OUTPUT = "output"; public static final String STATS_LOAD_PARALLEL = "stats.load.parallel"; public static final boolean DEFAULT_STATS_LOAD_PARALLEL = true; private static final String VARIANT_STATS_SUFFIX = ".variants.stats.json.gz"; private static final String SOURCE_STATS_SUFFIX = ".source.stats.json.gz"; private final JsonFactory jsonFactory; private final ObjectMapper jsonObjectMapper; private final VariantDBAdaptor dbAdaptor; protected static Logger logger = LoggerFactory.getLogger(DefaultVariantStatisticsManager.class); public DefaultVariantStatisticsManager(VariantDBAdaptor dbAdaptor) { this.dbAdaptor = dbAdaptor; jsonFactory = new JsonFactory(); jsonObjectMapper = new ObjectMapper(jsonFactory); jsonObjectMapper.addMixIn(VariantStats.class, VariantStatsJsonMixin.class); jsonObjectMapper.addMixIn(GenericRecord.class, GenericRecordAvroJsonMixin.class); } @Override public void calculateStatistics(String study, List<String> cohorts, QueryOptions options) throws IOException, StorageEngineException { URI output; try { output = UriUtils.createUri(options.getString(OUTPUT)); } catch (URISyntaxException e) { throw new IllegalArgumentException(e); } URI stats = createStats(dbAdaptor, output, study, cohorts, options); loadStats(dbAdaptor, stats, study, options); } private OutputStream getOutputStream(Path filePath, QueryOptions options) throws IOException { OutputStream outputStream = new FileOutputStream(filePath.toFile()); logger.info("will write stats to {}", filePath); if (filePath.toString().endsWith(".gz")) { outputStream = new GZIPOutputStream(outputStream); } return outputStream; } /** * Gets iterator from OpenCGA Variant database. **/ private Iterator<Variant> obtainIterator(VariantDBAdaptor variantDBAdaptor, Query query, QueryOptions options) { Query iteratorQuery = query != null ? query : new Query(); //Parse query options QueryOptions iteratorQueryOptions = options != null ? options : new QueryOptions(); // TODO rethink this way to refer to the Variant fields (through DBObjectToVariantConverter) List<String> include = Arrays.asList("chromosome", "start", "end", "alternate", "reference", "sourceEntries"); iteratorQueryOptions.add("include", include); return variantDBAdaptor.iterator(iteratorQuery, iteratorQueryOptions); } public URI createStats(VariantDBAdaptor variantDBAdaptor, URI output, String study, List<String> cohorts, QueryOptions options) throws IOException, StorageEngineException { StudyConfigurationManager studyConfigurationManager = variantDBAdaptor.getStudyConfigurationManager(); StudyConfiguration studyConfiguration = studyConfigurationManager.getStudyConfiguration(study, options).first(); Map<String, Set<String>> cohortsMap = new HashMap<>(cohorts.size()); for (String cohort : cohorts) { cohortsMap.put(cohort, Collections.emptySet()); } return createStats(variantDBAdaptor, output, cohortsMap, null, studyConfiguration, options); } /** * retrieves batches of Variants, delegates to obtain VariantStatsWrappers from those Variants, and writes them to the output URI. * <p> * steps: * <p> * * gets options like batchsize, overwrite, tagMap... if(options == null) throws * * if no cohorts provided and the study is aggregated, uses the cohorts in the tagMap if any * * add the cohorts to the studyConfiuration * * checks invalidated stats, and set overwrite=true if needed * * sets up a ParallelTaskRunner: a reader, a writer and tasks * * writes the source stats * * @param variantDBAdaptor to obtain the Variants * @param output where to write the VariantStats * @param cohorts cohorts (subsets) of the samples. key: cohort name, defaultValue: list of sample names. * @param cohortIds Cohort ID * @param studyConfiguration Study configuration object * @param options (mandatory) fileId, (optional) filters to the query, batch size, number of threads to use... * @return outputUri prefix for the file names (without the "._type_.stats.json.gz") * @throws IOException If any error occurs * @throws StorageEngineException If any error occurs */ public URI createStats(VariantDBAdaptor variantDBAdaptor, URI output, Map<String, Set<String>> cohorts, Map<String, Integer> cohortIds, StudyConfiguration studyConfiguration, QueryOptions options) throws IOException, StorageEngineException { // String fileId; if (options == null) { options = new QueryOptions(); } //Parse query options int batchSize = options.getInt(Options.LOAD_BATCH_SIZE.key(), 100); // future optimization, threads, etc int numTasks = options.getInt(Options.LOAD_THREADS.key(), 6); boolean overwrite = options.getBoolean(Options.OVERWRITE_STATS.key(), false); boolean updateStats = options.getBoolean(Options.UPDATE_STATS.key(), false); Properties tagmap = options.get(Options.AGGREGATION_MAPPING_PROPERTIES.key(), Properties.class, null); // fileId = options.getString(VariantStorageEngine.Options.FILE_ID.key()); // if no cohorts provided and the study is aggregated: try to get the cohorts from the tagMap if (cohorts == null || isAggregated(studyConfiguration.getAggregation()) && tagmap != null) { if (isAggregated(studyConfiguration.getAggregation()) && tagmap != null) { cohorts = new LinkedHashMap<>(); for (String c : VariantAggregatedStatsCalculator.getCohorts(tagmap)) { cohorts.put(c, Collections.emptySet()); } } else { cohorts = new LinkedHashMap<>(); } } checkAndUpdateStudyConfigurationCohorts(studyConfiguration, cohorts, cohortIds, overwrite, updateStats); if (!overwrite) { for (String cohortName : cohorts.keySet()) { Integer cohortId = studyConfiguration.getCohortIds().get(cohortName); if (studyConfiguration.getInvalidStats().contains(cohortId)) { logger.debug("Cohort \"{}\":{} is invalid. Need to overwrite stats. Using overwrite = true", cohortName, cohortId); overwrite = true; } } } checkStudyConfiguration(studyConfiguration); VariantSourceStats variantSourceStats = new VariantSourceStats(null/*FILE_ID*/, Integer.toString(studyConfiguration.getStudyId())); // reader, tasks and writer Query readerQuery = new Query(VariantDBAdaptor.VariantQueryParams.STUDIES.key(), studyConfiguration.getStudyId()) .append(VariantDBAdaptor.VariantQueryParams.RETURNED_STUDIES.key(), studyConfiguration.getStudyId()); if (options.containsKey(Options.FILE_ID.key())) { readerQuery.append(VariantDBAdaptor.VariantQueryParams.FILES.key(), options.get(Options.FILE_ID.key())); } if (options.containsKey(VariantDBAdaptor.VariantQueryParams.REGION.key())) { Object region = options.get(VariantDBAdaptor.VariantQueryParams.REGION.key()); readerQuery.put(VariantDBAdaptor.VariantQueryParams.REGION.key(), region); } if (updateStats) { //Get all variants that not contain any of the required cohorts readerQuery.append(VariantDBAdaptor.VariantQueryParams.COHORTS.key(), cohorts.keySet().stream().map((cohort) -> "!" + studyConfiguration.getStudyName() + ":" + cohort).collect(Collectors .joining(";"))); } logger.info("ReaderQuery: " + readerQuery.toJson()); QueryOptions readerOptions = new QueryOptions(QueryOptions.SORT, true) .append(QueryOptions.EXCLUDE, VariantField.ANNOTATION); logger.info("ReaderQueryOptions: " + readerOptions.toJson()); VariantDBReader reader = new VariantDBReader(studyConfiguration, variantDBAdaptor, readerQuery, readerOptions); List<ParallelTaskRunner.Task<Variant, String>> tasks = new ArrayList<>(numTasks); ProgressLogger progressLogger = new ProgressLogger("Calculated stats:", () -> variantDBAdaptor.count(readerQuery).first(), 200).setBatchSize(5000); for (int i = 0; i < numTasks; i++) { tasks.add(new VariantStatsWrapperTask(overwrite, cohorts, studyConfiguration, variantSourceStats, tagmap, progressLogger)); } Path variantStatsPath = Paths.get(output.getPath() + VARIANT_STATS_SUFFIX); logger.info("will write stats to {}", variantStatsPath); StringDataWriter writer = new StringDataWriter(variantStatsPath, true); // runner ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder().setNumTasks(numTasks).setBatchSize(batchSize).build(); ParallelTaskRunner runner = new ParallelTaskRunner<>(reader, tasks, writer, config); try { logger.info("starting stats creation for cohorts {}", cohorts.keySet()); long start = System.currentTimeMillis(); runner.run(); logger.info("finishing stats creation, time: {}ms", System.currentTimeMillis() - start); } catch (ExecutionException e) { throw new StorageEngineException("Unable to calculate statistics.", e); } // source stats Path fileSourcePath = Paths.get(output.getPath() + SOURCE_STATS_SUFFIX); try (OutputStream outputSourceStream = getOutputStream(fileSourcePath, options)) { ObjectWriter sourceWriter = jsonObjectMapper.writerFor(VariantSourceStats.class); outputSourceStream.write(sourceWriter.writeValueAsBytes(variantSourceStats)); } variantDBAdaptor.getStudyConfigurationManager().updateStudyConfiguration(studyConfiguration, options); return output; } class VariantStatsWrapperTask implements ParallelTaskRunner.Task<Variant, String> { private boolean overwrite; private Map<String, Set<String>> cohorts; private StudyConfiguration studyConfiguration; private final ProgressLogger progressLogger; // private String fileId; private ObjectMapper jsonObjectMapper; private ObjectWriter variantsWriter; private VariantSourceStats variantSourceStats; private Properties tagmap; private VariantStatisticsCalculator variantStatisticsCalculator; VariantStatsWrapperTask(boolean overwrite, Map<String, Set<String>> cohorts, StudyConfiguration studyConfiguration, VariantSourceStats variantSourceStats, Properties tagmap, ProgressLogger progressLogger) { this.overwrite = overwrite; this.cohorts = cohorts; this.studyConfiguration = studyConfiguration; this.progressLogger = progressLogger; jsonObjectMapper = new ObjectMapper(new JsonFactory()); jsonObjectMapper.addMixIn(VariantStats.class, VariantStatsJsonMixin.class); variantsWriter = jsonObjectMapper.writerFor(VariantStatsWrapper.class); this.variantSourceStats = variantSourceStats; this.tagmap = tagmap; variantStatisticsCalculator = new VariantStatisticsCalculator(overwrite); variantStatisticsCalculator.setAggregationType(studyConfiguration.getAggregation(), tagmap); } @Override public List<String> apply(List<Variant> variants) { List<String> strings = new ArrayList<>(variants.size()); boolean defaultCohortAbsent = false; List<VariantStatsWrapper> variantStatsWrappers = variantStatisticsCalculator.calculateBatch(variants, studyConfiguration.getStudyName(), null/*fileId*/, cohorts); long start = System.currentTimeMillis(); for (VariantStatsWrapper variantStatsWrapper : variantStatsWrappers) { try { strings.add(variantsWriter.writeValueAsString(variantStatsWrapper)); if (variantStatsWrapper.getCohortStats().get(StudyEntry.DEFAULT_COHORT) == null) { defaultCohortAbsent = true; } } catch (JsonProcessingException e) { e.printStackTrace(); } } // we don't want to overwrite file stats regarding all samples with stats about a subset of samples. Maybe if we change // VariantSource.stats to a map with every subset... if (!defaultCohortAbsent) { synchronized (variantSourceStats) { variantSourceStats.updateFileStats(variants); variantSourceStats.updateSampleStats(variants, null); // TODO test } } logger.debug("another batch of {} elements calculated. time: {}ms", strings.size(), System.currentTimeMillis() - start); if (!variants.isEmpty()) { progressLogger.increment(variants.size(), () -> ", up to position " + variants.get(variants.size() - 1).getChromosome() + ":" + variants.get(variants.size() - 1).getStart()); // logger.info("stats created up to position {}:{}", variants.get(variants.size() - 1).getChromosome(), // variants.get(variants.size() - 1).getStart()); } else { logger.info("task with empty batch"); } return strings; } @Override public void post() { if (variantStatisticsCalculator.getSkippedFiles() > 0) { logger.warn("Non calculated variant stats: " + variantStatisticsCalculator.getSkippedFiles()); } } } public void loadStats(VariantDBAdaptor variantDBAdaptor, URI uri, String study, QueryOptions options) throws IOException, StorageEngineException { StudyConfigurationManager studyConfigurationManager = variantDBAdaptor.getStudyConfigurationManager(); StudyConfiguration studyConfiguration = studyConfigurationManager.getStudyConfiguration(study, options).first(); loadStats(variantDBAdaptor, uri, studyConfiguration, options); } public void loadStats(VariantDBAdaptor variantDBAdaptor, URI uri, StudyConfiguration studyConfiguration, QueryOptions options) throws IOException, StorageEngineException { URI variantStatsUri = Paths.get(uri.getPath() + VARIANT_STATS_SUFFIX).toUri(); URI sourceStatsUri = Paths.get(uri.getPath() + SOURCE_STATS_SUFFIX).toUri(); boolean updateStats = options.getBoolean(Options.UPDATE_STATS.key(), false); checkAndUpdateCalculatedCohorts(studyConfiguration, variantStatsUri, updateStats); logger.info("starting stats loading from {} and {}", variantStatsUri, sourceStatsUri); long start = System.currentTimeMillis(); loadVariantStats(variantStatsUri, studyConfiguration, options); loadSourceStats(variantDBAdaptor, sourceStatsUri, studyConfiguration, options); logger.info("finishing stats loading, time: {}ms", System.currentTimeMillis() - start); variantDBAdaptor.getStudyConfigurationManager().updateStudyConfiguration(studyConfiguration, options); } public void loadVariantStats(URI uri, StudyConfiguration studyConfiguration, QueryOptions options) throws IOException, StorageEngineException { /* Open input streams */ Path variantInput = Paths.get(uri.getPath()); InputStream variantInputStream; variantInputStream = new FileInputStream(variantInput.toFile()); variantInputStream = new GZIPInputStream(variantInputStream); ProgressLogger progressLogger = new ProgressLogger("Loaded stats:"); ParallelTaskRunner<VariantStatsWrapper, ?> ptr; DataReader<VariantStatsWrapper> dataReader = newVariantStatsWrapperDataReader(variantInputStream); List<VariantStatsDBWriter> writers = new ArrayList<>(); if (options.getBoolean(STATS_LOAD_PARALLEL, DEFAULT_STATS_LOAD_PARALLEL)) { ptr = new ParallelTaskRunner<>( dataReader, () -> { VariantStatsDBWriter dbWriter = newVariantStatisticsDBWriter(dbAdaptor, studyConfiguration, options); dbWriter.pre(); dbWriter.setProgressLogger(progressLogger); writers.add(dbWriter); return (batch -> { dbWriter.write(batch); return Collections.emptyList(); }); }, null, ParallelTaskRunner.Config.builder().setAbortOnFail(true) .setBatchSize(options.getInt(Options.LOAD_BATCH_SIZE.key(), Options.LOAD_BATCH_SIZE.defaultValue())) .setNumTasks(options.getInt(Options.LOAD_THREADS.key(), Options.LOAD_THREADS.defaultValue())).build() ); } else { VariantStatsDBWriter dbWriter = newVariantStatisticsDBWriter(dbAdaptor, studyConfiguration, options); dbWriter.setProgressLogger(progressLogger); writers.add(dbWriter); ptr = new ParallelTaskRunner<>( dataReader, batch -> batch, dbWriter, ParallelTaskRunner.Config.builder().setAbortOnFail(true) .setBatchSize(options.getInt(Options.LOAD_BATCH_SIZE.key(), Options.LOAD_BATCH_SIZE.defaultValue())) .setNumTasks(options.getInt(Options.LOAD_THREADS.key(), Options.LOAD_THREADS.defaultValue())).build() ); } try { ptr.run(); } catch (ExecutionException e) { throw new StorageEngineException("Error loading stats", e); } Long writes = writers.stream().map(VariantStatsDBWriter::getNumWrites).reduce((a, b) -> a + b).orElse(0L); Long variantStats = writers.stream().map(VariantStatsDBWriter::getVariantStats).reduce((a, b) -> a + b).orElse(0L); if (writes < variantStats) { logger.warn("provided statistics of {} variants, but only {} were updated", variantStats, writes); logger.info("note: maybe those variants didn't had the proper study? maybe the new and the old stats were the same?"); } } protected DataReader<VariantStatsWrapper> newVariantStatsWrapperDataReader(InputStream inputStream) { JsonDataReader<VariantStatsWrapper> reader = new JsonDataReader<>(VariantStatsWrapper.class, inputStream); reader.addMixIn(VariantStats.class, VariantStatsJsonMixin.class); reader.addMixIn(GenericRecord.class, GenericRecordAvroJsonMixin.class); return reader; } protected VariantStatsDBWriter newVariantStatisticsDBWriter(VariantDBAdaptor dbAdaptor, StudyConfiguration studyConfiguration, QueryOptions options) { return new VariantStatsDBWriter(dbAdaptor, studyConfiguration, options); } public void loadSourceStats(VariantDBAdaptor variantDBAdaptor, URI uri, String study, QueryOptions options) throws IOException { StudyConfigurationManager studyConfigurationManager = variantDBAdaptor.getStudyConfigurationManager(); StudyConfiguration studyConfiguration = studyConfigurationManager.getStudyConfiguration(study, options).first(); loadSourceStats(variantDBAdaptor, uri, studyConfiguration, options); } public void loadSourceStats(VariantDBAdaptor variantDBAdaptor, URI uri, StudyConfiguration studyConfiguration, QueryOptions options) throws IOException { /* select input path */ Path sourceInput = Paths.get(uri.getPath()); /* Open input stream and initialize Json parse */ VariantSourceStats variantSourceStats; try (InputStream sourceInputStream = new GZIPInputStream(new FileInputStream(sourceInput.toFile())); JsonParser sourceParser = jsonFactory.createParser(sourceInputStream)) { variantSourceStats = sourceParser.readValueAs(VariantSourceStats.class); } // TODO if variantSourceStats doesn't have studyId and fileId, create another with variantSource.getStudyId() and variantSource // .getFileId() variantDBAdaptor.getVariantSourceDBAdaptor().updateSourceStats(variantSourceStats, studyConfiguration, options); } /* * Check that all SampleIds are in the StudyConfiguration. * <p> * If some cohort does not have samples, reads the content from StudyConfiguration. * If there is no cohortId for come cohort, reads the content from StudyConfiguration or auto-generate a cohortId * If some cohort has a different number of samples, check if this cohort is invalid. * <p> * Do not update the "calculatedStats" array. Just check that the provided cohorts are not calculated or invalid. * <p> * new requirements: * * an empty cohort is not an error if the study is aggregated * * there may be several empty cohorts, not just the ALL, because there may be several aggregated files with different sets of * hidden samples. * * if a cohort is already calculated, it is not an error if overwrite was provided * */ List<Integer> checkAndUpdateStudyConfigurationCohorts(StudyConfiguration studyConfiguration, Map<String, Set<String>> cohorts, Map<String, Integer> cohortIds, boolean overwrite, boolean updateStats) throws StorageEngineException { List<Integer> cohortIdList = new ArrayList<>(); for (Map.Entry<String, Set<String>> entry : cohorts.entrySet()) { String cohortName = entry.getKey(); Set<String> samples = entry.getValue(); final int cohortId; // get a valid cohortId if (cohortIds == null || cohortIds.isEmpty()) { if (studyConfiguration.getCohortIds().containsKey(cohortName)) { cohortId = studyConfiguration.getCohortIds().get(cohortName); } else { //Auto-generate cohortId. Max CohortId + 1 // if there are no cohorts and we are creating the first as 0 cohortId = studyConfiguration.getCohortIds().isEmpty() ? 0 : Collections.max(studyConfiguration.getCohortIds().values()) + 1; } } else { if (!cohortIds.containsKey(cohortName)) { //ERROR Missing cohortId throw new StorageEngineException("Missing cohortId for the cohort : " + cohortName); } cohortId = cohortIds.get(entry.getKey()); } // check that the cohortId-cohortName is consistent with StudyConfiguration if (studyConfiguration.getCohortIds().containsKey(cohortName)) { if (!studyConfiguration.getCohortIds().get(cohortName).equals(cohortId)) { //ERROR Duplicated cohortName throw new StorageEngineException("Duplicated cohortName " + cohortName + ":" + cohortId + ". Appears in the StudyConfiguration as " + cohortName + ":" + studyConfiguration.getCohortIds().get(cohortName)); } } else if (studyConfiguration.getCohortIds().containsValue(cohortId)) { //ERROR Duplicated cohortId throw new StorageEngineException("Duplicated cohortId " + cohortName + ":" + cohortId + ". Appears in the StudyConfiguration as " + StudyConfiguration.inverseMap(studyConfiguration.getCohortIds()).get(cohortId) + ":" + cohortId); } final Set<Integer> sampleIds; if (samples == null || samples.isEmpty()) { //There are not provided samples for this cohort. Take samples from StudyConfiguration if (isAggregated(studyConfiguration.getAggregation())) { samples = Collections.emptySet(); sampleIds = Collections.emptySet(); } else { sampleIds = studyConfiguration.getCohorts().get(cohortId); if (sampleIds == null || sampleIds.isEmpty()) { // if (sampleIds == null || (sampleIds.isEmpty() // && VariantSource.Aggregation.NONE.equals(studyConfiguration.getAggregation()))) { //ERROR: StudyConfiguration does not have samples for this cohort, and it is not an aggregated study throw new StorageEngineException("Cohort \"" + cohortName + "\" is empty"); } samples = new HashSet<>(); Map<Integer, String> idSamples = StudyConfiguration.inverseMap(studyConfiguration.getSampleIds()); for (Integer sampleId : sampleIds) { samples.add(idSamples.get(sampleId)); } } cohorts.put(cohortName, samples); } else { sampleIds = new HashSet<>(samples.size()); for (String sample : samples) { if (!studyConfiguration.getSampleIds().containsKey(sample)) { //ERROR Sample not found throw new StorageEngineException("Sample " + sample + " not found in the StudyConfiguration"); } else { sampleIds.add(studyConfiguration.getSampleIds().get(sample)); } } if (sampleIds.size() != samples.size()) { throw new StorageEngineException("Duplicated samples in cohort " + cohortName + ":" + cohortId); } if (studyConfiguration.getCohorts().get(cohortId) != null && !sampleIds.equals(studyConfiguration.getCohorts().get(cohortId))) { if (!studyConfiguration.getInvalidStats().contains(cohortId) && studyConfiguration.getCalculatedStats().contains(cohortId)) { //If provided samples are different than the stored in the StudyConfiguration, and the cohort was not invalid. throw new StorageEngineException("Different samples in cohort " + cohortName + ":" + cohortId + ". " + "Samples in the StudyConfiguration: " + studyConfiguration.getCohorts().get(cohortId).size() + ". " + "Samples provided " + samples.size() + ". Invalidate stats to continue."); } } } // if (studyConfiguration.getInvalidStats().contains(cohortId)) { // throw new IOException("Cohort \"" + cohortName + "\" stats already calculated and INVALID"); // } if (studyConfiguration.getCalculatedStats().contains(cohortId)) { if (overwrite) { studyConfiguration.getCalculatedStats().remove(cohortId); studyConfiguration.getInvalidStats().add(cohortId); } else if (updateStats) { logger.debug("Cohort \"" + cohortName + "\" stats already calculated. Calculate only for missing positions"); } else { throw new StorageEngineException("Cohort \"" + cohortName + "\" stats already calculated"); } } cohortIdList.add(cohortId); studyConfiguration.getCohortIds().put(cohortName, cohortId); studyConfiguration.getCohorts().put(cohortId, sampleIds); } return cohortIdList; } void checkAndUpdateCalculatedCohorts(StudyConfiguration studyConfiguration, URI uri, boolean updateStats) throws IOException { /** Select input path **/ Path variantInput = Paths.get(uri.getPath()); /** Open input streams and Initialize Json parse **/ try (InputStream variantInputStream = new GZIPInputStream(new FileInputStream(variantInput.toFile())); JsonParser parser = jsonFactory.createParser(variantInputStream)) { if (parser.nextToken() != null) { VariantStatsWrapper variantStatsWrapper = parser.readValueAs(VariantStatsWrapper.class); Set<String> cohortNames = variantStatsWrapper.getCohortStats().keySet(); checkAndUpdateCalculatedCohorts(studyConfiguration, cohortNames, updateStats); } else { throw new IOException("File " + uri + " is empty"); } } } void checkAndUpdateCalculatedCohorts(StudyConfiguration studyConfiguration, Collection<String> cohorts, boolean updateStats) throws IOException { for (String cohortName : cohorts) { // if (cohortName.equals(VariantSourceEntry.DEFAULT_COHORT)) { // continue; // } Integer cohortId = studyConfiguration.getCohortIds().get(cohortName); if (studyConfiguration.getInvalidStats().contains(cohortId)) { // throw new IOException("Cohort \"" + cohortName + "\" stats already calculated and INVALID"); logger.debug("Cohort \"" + cohortName + "\" stats calculated and INVALID. Set as calculated"); studyConfiguration.getInvalidStats().remove(cohortId); } if (studyConfiguration.getCalculatedStats().contains(cohortId)) { if (!updateStats) { throw new IOException("Cohort \"" + cohortName + "\" stats already calculated"); } } else { studyConfiguration.getCalculatedStats().add(cohortId); } } } }