/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.opencga.storage.core.variant.stats; import org.opencb.biodata.models.variant.StudyEntry; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.VariantSource; import org.opencb.biodata.models.variant.stats.VariantStats; import org.opencb.biodata.tools.variant.stats.VariantAggregatedEVSStatsCalculator; import org.opencb.biodata.tools.variant.stats.VariantAggregatedExacStatsCalculator; import org.opencb.biodata.tools.variant.stats.VariantAggregatedStatsCalculator; import org.opencb.biodata.tools.variant.stats.VariantStatsCalculator; import java.util.*; import static org.opencb.biodata.models.variant.VariantSource.Aggregation.isAggregated; /** * Created by jmmut on 28/01/15. */ public class VariantStatisticsCalculator { private int skippedFiles; private boolean overwrite; private VariantAggregatedStatsCalculator aggregatedCalculator; private VariantSource.Aggregation aggregation; public VariantStatisticsCalculator() { this(false); } public VariantStatisticsCalculator(boolean overwrite) { this.overwrite = overwrite; skippedFiles = 0; } public int getSkippedFiles() { return skippedFiles; } public void setSkippedFiles(int skippedFiles) { this.skippedFiles = skippedFiles; } /** * if the study is aggregated i.e. it doesn't have sample data, call this before calculate. It is not needed if the * study does have samples. * * @param aggregation see org.opencb.biodata.models.variant.VariantSource.Aggregation * @param tagmap nullable, see org.opencb.biodata.tools.variant.stats.VariantAggregatedStatsCalculator() */ public void setAggregationType(VariantSource.Aggregation aggregation, Properties tagmap) { aggregatedCalculator = null; this.aggregation = aggregation; switch (this.aggregation) { case NONE: aggregatedCalculator = null; break; case BASIC: aggregatedCalculator = new VariantAggregatedStatsCalculator(tagmap); break; case EVS: aggregatedCalculator = new VariantAggregatedEVSStatsCalculator(tagmap); break; case EXAC: aggregatedCalculator = new VariantAggregatedExacStatsCalculator(tagmap); break; default: break; } } /** * Creates another map with the intersection of the parameters. * * @param allSamples Map that contains the values we want a subset of * @param samplesToKeep set of names of samples * @param <T> Type * @return variant with just the samples in 'samplesToKeep' */ public <T> Map<String, T> filterSamples(Map<String, T> allSamples, Set<String> samplesToKeep) { Map<String, T> filtered = new HashMap<>(); if (samplesToKeep != null) { for (String sampleName : allSamples.keySet()) { if (samplesToKeep.contains(sampleName)) { filtered.put(sampleName, allSamples.get(sampleName)); } } } return filtered; } /** * computes the VariantStats for each subset of samples. * * @param variants variants to to calculate stats from * @param studyId needed to choose the VariantSourceEntry in the variants * @param fileId needed to choose the VariantSourceEntry in the variants * @param samples keys are cohort names, values are sets of samples names. groups of samples (cohorts) for each to compute * VariantStats. * @return list of VariantStatsWrapper. may be shorter than the list of variants if there is no source for some variant */ public List<VariantStatsWrapper> calculateBatch(List<Variant> variants, String studyId, String fileId, Map<String, Set<String>> samples) { List<VariantStatsWrapper> variantStatsWrappers = new ArrayList<>(variants.size()); for (Variant variant : variants) { StudyEntry study = null; for (StudyEntry entry : variant.getStudies()) { if (entry.getStudyId().equals(studyId)) { study = entry; break; } } if (study == null) { skippedFiles++; continue; } // Clear any stats from the input study.setStats(new HashMap<>()); if (!isAggregated(aggregation) && samples != null) { for (Map.Entry<String, Set<String>> cohort : samples.entrySet()) { if (overwrite || study.getStats(cohort.getKey()) == null) { VariantStats variantStats = new VariantStats(variant); Map<String, String> attributes = study.getAttributes(); attributes = attributes == null ? Collections.emptyMap() : attributes; VariantStatsCalculator.calculate(study, cohort.getValue(), attributes, null, variantStats); study.setStats(cohort.getKey(), variantStats); } } } else if (aggregatedCalculator != null) { // another way to say that the study is aggregated (!VariantSource.Aggregation // .NONE.equals(aggregation)) // study.setAttributes(removeAttributePrefix(study.getAttributes())); aggregatedCalculator.calculate(variant, study); } // if (overwrite || file.stats() == null) { // VariantStats allVariantStats = new VariantStats(variant); // file.setCohortStats(VariantSourceEntry.DEFAULT_COHORT // , allVariantStats.calculate(file.getSamplesData(), file.getAttributes(), null)); // // } variantStatsWrappers.add( new VariantStatsWrapper(variant.getChromosome(), variant.getStart(), study.getStats())); } return variantStatsWrappers; } @Deprecated public static Map<String, String> removeAttributePrefix(Map<String, String> attributes) throws IllegalArgumentException { Map<String, String> newAttributes = new LinkedHashMap<>(attributes.size()); Set<String> prefixSet = new LinkedHashSet<>(); for (String key : attributes.keySet()) { String[] split = key.split("_", 2); prefixSet.add(split[0]); newAttributes.put(split[1], attributes.get(key)); } if (prefixSet.size() > 1) { throw new IllegalArgumentException("attributes should contain only one fileId prefix, and there are: " + prefixSet.toString()); } return newAttributes; } }