/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.core.variant.adaptors;
import org.opencb.biodata.models.core.Region;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.AdditionalAttribute;
import org.opencb.biodata.models.variant.avro.VariantAnnotation;
import org.opencb.cellbase.client.rest.CellBaseClient;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryParam;
import org.opencb.commons.datastore.core.QueryResult;
import org.opencb.commons.io.DataWriter;
import org.opencb.opencga.core.results.VariantQueryResult;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.metadata.StudyConfigurationManager;
import org.opencb.opencga.storage.core.variant.stats.VariantStatsWrapper;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import static org.opencb.commons.datastore.core.QueryParam.Type.*;
/**
* @author Ignacio Medina <igmecas@gmail.com>
* @author Jacobo Coll <jacobo167@gmail.com>
* @author Cristina Yenyxe Gonzalez Garcia <cgonzalez@cipf.es>
*/
public interface VariantDBAdaptor extends VariantIterable, AutoCloseable {
String ID_DESCR = "List of variant ids";
String REGION_DESCR = "List of regions: {chr}:{start}-{end}, e.g.: 2,3:1000000-2000000";
String CHROMOSOME_DESCR = "List of chromosomes";
String GENE_DESCR = "List of genes";
String TYPE_DESCR = "Variant type: [SNV, MNV, INDEL, SV, CNV]";
String REFERENCE_DESCR = "Reference allele";
String ALTERNATE_DESCR = "Main alternate allele";
String STUDIES_DESCR = "";
String RETURNED_STUDIES_DESCR = "List of studies to be returned";
// String SAMPLES_DESCR = "Filter variants where ALL the provided samples are mutated (not HOM_REF or missing)";
String SAMPLES_DESCR = "Filter variants where ALL the provided samples are mutated (HET or HOM_ALT)";
String GENOTYPE_DESCR = "Samples with a specific genotype: {samp_1}:{gt_1}(,{gt_n})*(;{samp_n}:{gt_1}(,{gt_n})*)*"
+ " e.g. HG0097:0/0;HG0098:0/1,1/1";
String RETURNED_SAMPLES_DESCR = "List of samples to be returned";
String SAMPLES_METADATA_DESCR =
"Returns the samples metadata group by study. Sample names will appear in the same order as their corresponding genotypes.";
String FILES_DESCR = "Select variants in specific files";
String FILTER_DESCR =
"Specify the FILTER for any of the files. If \"files\" filter is provided, will match the file and the filter."
+ " e.g.: PASS,LowGQX";
String RETURNED_FILES_DESCR = "List of files to be returned";
String COHORTS_DESCR = "Select variants with calculated stats for the selected cohorts";
String STATS_MAF_DESCR = "Minor Allele Frequency: [{study:}]{cohort}[<|>|<=|>=]{number}";
String STATS_MGF_DESCR = "Minor Genotype Frequency: [{study:}]{cohort}[<|>|<=|>=]{number}";
String MISSING_ALLELES_DESCR = "Number of missing alleles: [{study:}]{cohort}[<|>|<=|>=]{number}";
String MISSING_GENOTYPES_DESCR = "Number of missing genotypes: [{study:}]{cohort}[<|>|<=|>=]{number}";
String ANNOTATION_EXISTS_DESCR = "Specify if the variant annotation must exists.";
String ANNOT_CONSEQUENCE_TYPE_DESCR = "Consequence type SO term list. e.g. missense_variant,stop_lost or SO:0001583,SO:0001578";
String ANNOT_XREF_DESCR = "External references.";
String ANNOT_BIOTYPE_DESCR = "Biotype";
String ANNOT_POLYPHEN_DESCR = "Polyphen, protein substitution score. [<|>|<=|>=]{number} or [~=|=|]{description} e.g. <=0.9 , =benign";
String ANNOT_SIFT_DESCR = "Sift, protein substitution score. [<|>|<=|>=]{number} or [~=|=|]{description} e.g. >0.1 , ~=tolerant";
String ANNOT_PROTEIN_SUBSTITUTION_DESCR = "Protein substitution score. {protein_score}[<|>|<=|>=]{number} or"
+ " {protein_score}[~=|=]{description} e.g. polyphen>0.1 , sift=tolerant";
String ANNOT_CONSERVATION_DESCR = "Conservation score: {conservation_score}[<|>|<=|>=]{number} e.g. phastCons>0.5,phylop<0.1,gerp>0.1";
String ANNOT_POPULATION_ALTERNATE_FREQUENCY_DESCR = "Alternate Population Frequency: {study}:{population}[<|>|<=|>=]{number}";
String ANNOT_POPULATION_REFERENCE_FREQUENCY_DESCR = "Reference Population Frequency: {study}:{population}[<|>|<=|>=]{number}";
String ANNOT_POPULATION_MINOR_ALLELE_FREQUENCY_DESCR = "Population minor allele frequency: {study}:{population}[<|>|<=|>=]{number}";
String ANNOT_TRANSCRIPTION_FLAGS_DESCR = "List of transcript annotation flags. e.g. CCDS,basic,cds_end_NF,"
+ "mRNA_end_NF,cds_start_NF,mRNA_start_NF,seleno";
String ANNOT_GENE_TRAITS_ID_DESCR = "List of gene trait association id. e.g. \"umls:C0007222,OMIM:269600\"";
String ANNOT_GENE_TRAITS_NAME_DESCR = "List of gene trait association names. e.g. \"Cardiovascular Diseases\"";
String ANNOT_CLINVAR_DESCR = "List of ClinVar accessions";
String ANNOT_COSMIC_DESCR = "List of COSMIC mutation IDs.";
String ANNOT_HPO_DESCR = "List of HPO terms. e.g. \"HP:0000545\"";
String ANNOT_TRAITS_DESCR = "List of traits, based on ClinVar, HPO, COSMIC, i.e.: IDs, histologies, descriptions,...";
String ANNOT_GO_DESCR = "List of GO (Genome Ontology) terms. e.g. \"GO:0002020\"";
String ANNOT_EXPRESSION_DESCR = "List of tissues of interest. e.g. \"tongue\"";
String ANNOT_PROTEIN_KEYWORDS_DESCR = "List of protein variant annotation keywords";
String ANNOT_DRUG_DESCR = "List of drug names";
String ANNOT_FUNCTIONAL_SCORE_DESCR = "Functional score: {functional_score}[<|>|<=|>=]{number}, e.g. cadd_scaled>5.2,cadd_raw<=0.3";
String ANNOT_CUSTOM_DESCR = "Custom annotation: {key}[<|>|<=|>=]{number} or {key}[~=|=]{text}";
String UNKNOWN_GENOTYPE_DESCR = "Returned genotype for unknown genotypes. Common values: [0/0, 0|0, ./.]";
enum VariantQueryParams implements QueryParam {
ID("ids", TEXT_ARRAY, ID_DESCR),
REGION("region", TEXT_ARRAY, REGION_DESCR),
CHROMOSOME("chromosome", TEXT_ARRAY, CHROMOSOME_DESCR),
GENE("gene", TEXT_ARRAY, GENE_DESCR),
TYPE("type", TEXT_ARRAY, TYPE_DESCR),
REFERENCE("reference", TEXT_ARRAY, REFERENCE_DESCR),
ALTERNATE("alternate", TEXT_ARRAY, ALTERNATE_DESCR),
//EFFECT ("TEXT_ARRAY", null, ),
STUDIES("studies", TEXT_ARRAY, STUDIES_DESCR),
RETURNED_STUDIES("returnedStudies", TEXT_ARRAY, RETURNED_STUDIES_DESCR),
SAMPLES("samples", TEXT_ARRAY, SAMPLES_DESCR),
//[<study>:]<sample>:<genotype>[,<genotype>]*
GENOTYPE("genotype", TEXT_ARRAY, GENOTYPE_DESCR),
RETURNED_SAMPLES("returnedSamples", TEXT_ARRAY, RETURNED_SAMPLES_DESCR),
SAMPLES_METADATA("samplesMetadata", TEXT_ARRAY, SAMPLES_METADATA_DESCR),
FILES("files", TEXT_ARRAY, FILES_DESCR),
FILTER("filter", TEXT_ARRAY, FILTER_DESCR),
RETURNED_FILES("returnedFiles", TEXT_ARRAY, RETURNED_FILES_DESCR),
COHORTS("cohorts", TEXT_ARRAY, COHORTS_DESCR),
STATS_MAF("maf", TEXT_ARRAY, STATS_MAF_DESCR),
STATS_MGF("mgf", TEXT_ARRAY, STATS_MGF_DESCR),
MISSING_ALLELES("missingAlleles", TEXT_ARRAY, MISSING_ALLELES_DESCR),
MISSING_GENOTYPES("missingGenotypes", TEXT_ARRAY, MISSING_GENOTYPES_DESCR),
ANNOTATION_EXISTS("annotationExists", BOOLEAN, ANNOTATION_EXISTS_DESCR),
ANNOT_CONSEQUENCE_TYPE("annot-ct", TEXT_ARRAY, ANNOT_CONSEQUENCE_TYPE_DESCR),
ANNOT_XREF("annot-xref", TEXT_ARRAY, ANNOT_XREF_DESCR),
ANNOT_BIOTYPE("annot-biotype", TEXT_ARRAY, ANNOT_BIOTYPE_DESCR),
ANNOT_POLYPHEN("polyphen", TEXT_ARRAY, ANNOT_POLYPHEN_DESCR),
ANNOT_SIFT("sift", TEXT_ARRAY, ANNOT_SIFT_DESCR),
ANNOT_PROTEIN_SUBSTITUTION("protein_substitution", TEXT_ARRAY, ANNOT_PROTEIN_SUBSTITUTION_DESCR),
ANNOT_CONSERVATION("conservation", TEXT_ARRAY, ANNOT_CONSERVATION_DESCR),
ANNOT_POPULATION_ALTERNATE_FREQUENCY("alternate_frequency", TEXT_ARRAY, ANNOT_POPULATION_ALTERNATE_FREQUENCY_DESCR),
ANNOT_POPULATION_REFERENCE_FREQUENCY("reference_frequency", TEXT_ARRAY, ANNOT_POPULATION_REFERENCE_FREQUENCY_DESCR),
ANNOT_POPULATION_MINOR_ALLELE_FREQUENCY("annot-population-maf", TEXT_ARRAY, ANNOT_POPULATION_MINOR_ALLELE_FREQUENCY_DESCR),
ANNOT_TRANSCRIPTION_FLAGS("annot-transcription-flags", TEXT_ARRAY, ANNOT_TRANSCRIPTION_FLAGS_DESCR),
ANNOT_GENE_TRAITS_ID("annot-gene-trait-id", TEXT_ARRAY, ANNOT_GENE_TRAITS_ID_DESCR),
ANNOT_GENE_TRAITS_NAME("annot-gene-trait-name", TEXT_ARRAY, ANNOT_GENE_TRAITS_NAME_DESCR),
ANNOT_CLINVAR("clinvar", TEXT_ARRAY, ANNOT_CLINVAR_DESCR),
ANNOT_COSMIC("cosmic", TEXT_ARRAY, ANNOT_COSMIC_DESCR),
ANNOT_TRAITS("traits", TEXT_ARRAY, ANNOT_TRAITS_DESCR),
ANNOT_HPO("annot-hpo", TEXT_ARRAY, ANNOT_HPO_DESCR),
ANNOT_GO("annot-go", TEXT_ARRAY, ANNOT_GO_DESCR),
ANNOT_EXPRESSION("annot-expression", TEXT_ARRAY, ANNOT_EXPRESSION_DESCR),
ANNOT_PROTEIN_KEYWORDS("annot-protein-keywords", TEXT_ARRAY, ANNOT_PROTEIN_KEYWORDS_DESCR),
ANNOT_DRUG("annot-drug", TEXT_ARRAY, ANNOT_DRUG_DESCR),
ANNOT_FUNCTIONAL_SCORE("annot-functional-score", TEXT_ARRAY, ANNOT_FUNCTIONAL_SCORE_DESCR),
ANNOT_CUSTOM("annot-custom", TEXT_ARRAY, ANNOT_CUSTOM_DESCR),
UNKNOWN_GENOTYPE("unknownGenotype", TEXT, UNKNOWN_GENOTYPE_DESCR);
VariantQueryParams(String key, Type type, String description) {
this.key = key;
this.type = type;
this.description = description;
}
private final String key;
private Type type;
private String description;
@Override
public String key() {
return key;
}
@Override
public String description() {
return description;
}
@Override
public Type type() {
return type;
}
@Override
public String toString() {
return key() + " [" + type() + "] : " + description();
}
}
/**
* This method sets a data writer object for data serialization. When used no data will be return in
* QueryResult object but written into the writer.
*
* @param dataWriter Deprecated param
*/
@Deprecated
default void setDataWriter(DataWriter dataWriter) {}
/**
* This method inserts Variants into the given Study. If the Study already exists then it just adds the new Sample
* genotypes, also new variants are inserted. If it is a new Study then Sample genotypes are added to the new Study.
*
* @param variants List of variants in OpenCB data model to be inserted
* @param studyName Name or alias of the study
* @param options Query modifiers, accepted values are: include, exclude, limit, skip, sort and count
* @return A QueryResult with the number of inserted variants
*/
QueryResult insert(List<Variant> variants, String studyName, QueryOptions options);
/**
* Delete all the variants from the database resulting of executing the query.
*
* @param query Query to be executed in the database
* @param options Query modifiers, accepted values are: include, exclude, limit, skip, sort and count
* @return A QueryResult with the number of deleted variants
*/
QueryResult delete(Query query, QueryOptions options);
/**
* Delete all the given samples belonging to the study from the database.
*
* @param studyName The study name where samples belong to
* @param sampleNames Sample names to be deleted, these must belong to the study
* @param options Query modifiers, accepted values are: include, exclude, limit, skip, sort and count
* @return A QueryResult with a list with all the samples deleted
*/
QueryResult deleteSamples(String studyName, List<String> sampleNames, QueryOptions options);
/**
* Delete the given file from the database with all the samples it has.
*
* @param studyName The study where the file belong
* @param fileName The file name to be deleted, it must belong to the study
* @param options Query modifiers, accepted values are: include, exclude, limit, skip, sort and count
* @return A QueryResult with the file deleted
*/
QueryResult deleteFile(String studyName, String fileName, QueryOptions options);
/**
* Delete the given study from the database.
*
* @param studyName The study name to delete
* @param options Query modifiers, accepted values are: purge
* @return A QueryResult with the study deleted
*/
QueryResult deleteStudy(String studyName, QueryOptions options);
/**
* Fetch all variants resulting of executing the query in the database. Returned fields are taken from
* the 'include' and 'exclude' fields at options.
*
* @param query Query to be executed in the database to filter variants
* @param options Query modifiers, accepted values are: include, exclude, limit, skip, sort and count
* @return A QueryResult with the result of the query
*/
VariantQueryResult<Variant> get(Query query, QueryOptions options);
/**
* Fetch all variants resulting of executing all the queries in the database. Returned fields are taken from
* the 'include' and 'exclude' fields at options.
*
* @param queries List of queries to be executed in the database to filter variants
* @param options Query modifiers, accepted values are: include, exclude, limit, skip, sort and count.
* @return A list of QueryResult with the result of the queries
*/
List<VariantQueryResult<Variant>> get(List<Query> queries, QueryOptions options);
/**
* Return all the variants in the same phase set for a given sample in a given variant.
*
* @param variant The main variant. See {@link Variant#toString()}
* @param studyName Study of the sample
* @param sampleName Sample name
* @param options Other options
* @param windowsSize Windows size for searching the phased variants.
* @return A QueryResult with the result of the query
*/
VariantQueryResult<Variant> getPhased(String variant, String studyName, String sampleName, QueryOptions options, int windowsSize);
/**
* Performs a distinct operation of the given field over the returned results.
*
* @param query Query to be executed in the database to filter variants
* @return A QueryResult with the all the distinct values
*/
QueryResult<Long> count(Query query);
/**
* Performs a distinct operation of the given field over the returned results.
*
* @param query Query to be executed in the database to filter variants
* @param field Field to be distinct, it must be a valid QueryParams id
* @return A QueryResult with the all the distinct values
*/
QueryResult distinct(Query query, String field);
/**
* This methods calculates the number of variants at different equally-sized genome chunks. This can be renderer
* as a histogram of the number of variants across a genomic region.
*
* @param query Query to be executed in the database to filter variants
* @param region Region where to calculate the variant frequency
* @param regionIntervalSize Size of the interval window, by default it is adjusted to return 200 chunks
* @return Frequencies of queried variants
*/
QueryResult getFrequency(Query query, Region region, int regionIntervalSize);
/**
* This method ranks different entities with the most or the least number of variants. These entities
* can be 'gene' or 'consequence_type' among others.
*
* @param query Query to be executed in the database to filter variants
* @param field The entity to rank
* @param numResults The max number of results to return
* @param asc Whether we want the top or the bottom part of the rank
* @return A QueryResult with a list of the entities and the number of elements
*/
QueryResult rank(Query query, String field, int numResults, boolean asc);
QueryResult groupBy(Query query, String field, QueryOptions options);
QueryResult groupBy(Query query, List<String> fields, QueryOptions options);
default List<Integer> getReturnedStudies(Query query, QueryOptions options) {
return getDBAdaptorUtils().getReturnedStudies(query, options);
}
/**
* Returns all the possible samples to be returned by an specific query.
*
* @param query Query to execute
* @param options Query Options
* @return Map key: StudyId, value: list of sampleIds
*/
default Map<Integer, List<Integer>> getReturnedSamples(Query query, QueryOptions options) {
return getDBAdaptorUtils().getReturnedSamples(query, options);
}
@Deprecated
default QueryResult addStats(List<VariantStatsWrapper> variantStatsWrappers, String studyName, QueryOptions queryOptions) {
return updateStats(variantStatsWrappers, studyName, queryOptions);
}
QueryResult updateStats(List<VariantStatsWrapper> variantStatsWrappers, String studyName, QueryOptions queryOptions);
QueryResult updateStats(List<VariantStatsWrapper> variantStatsWrappers, StudyConfiguration studyConfiguration, QueryOptions options);
QueryResult deleteStats(String studyName, String cohortName, QueryOptions options);
@Deprecated
QueryResult addAnnotations(List<VariantAnnotation> variantAnnotations, QueryOptions queryOptions);
QueryResult updateAnnotations(List<VariantAnnotation> variantAnnotations, QueryOptions queryOptions);
/**
* Update custom annotation for all the variants with in a given region.
*
* @param query Region to update
* @param name Custom annotation name.
* @param attribute Custom annotation for the region
* @param options Other options
* @return Result of the insertion
*/
QueryResult updateCustomAnnotations(Query query, String name, AdditionalAttribute attribute, QueryOptions options);
@Deprecated
QueryResult deleteAnnotation(String annotationId, Query query, QueryOptions queryOptions);
VariantSourceDBAdaptor getVariantSourceDBAdaptor();
StudyConfigurationManager getStudyConfigurationManager();
void setStudyConfigurationManager(StudyConfigurationManager studyConfigurationManager);
CellBaseClient getCellBaseClient();
VariantDBAdaptorUtils getDBAdaptorUtils();
void close() throws IOException;
/**
* Given a genomic region, it retrieves a set of variants and, optionally, all the information
* about their samples, effects and statistics. These optional arguments are specified in the "options" dictionary,
* with the keys (values must be set to true): "samples", "effects" and "stats", respectively.
*
* @param options Optional arguments
* @return A QueryResult containing a set of variants and other optional information
*/
@Deprecated
default QueryResult<Variant> getAllVariants(QueryOptions options) {
throw new UnsupportedOperationException();
}
@Deprecated
default QueryResult<Variant> getVariantById(String id, QueryOptions options) {
throw new UnsupportedOperationException();
}
@Deprecated
default List<QueryResult<Variant>> getAllVariantsByIdList(List<String> idList, QueryOptions options) {
throw new UnsupportedOperationException();
}
/**
* Given a genomic region, it retrieves a set of variants and, optionally, all the information
* about their samples, effects and statistics. These optional arguments are specified in the "options" dictionary,
* with the keys (values must be set to true): "samples", "effects" and "stats", respectively.
*
* @param region The region where variants must be searched
* @param options Optional arguments
* @return A QueryResult containing a set of variants and other optional information
*/
@Deprecated
default QueryResult<Variant> getAllVariantsByRegion(Region region, QueryOptions options) {
throw new UnsupportedOperationException();
}
@Deprecated
default List<QueryResult<Variant>> getAllVariantsByRegionList(List<Region> regionList, QueryOptions options) {
throw new UnsupportedOperationException();
}
@Deprecated
default QueryResult getVariantFrequencyByRegion(Region region, QueryOptions options) {
throw new UnsupportedOperationException();
}
@Deprecated
default QueryResult groupBy(String field, QueryOptions options) {
throw new UnsupportedOperationException();
}
@Deprecated
default VariantDBIterator iterator(QueryOptions options) {
throw new UnsupportedOperationException();
}
@Deprecated
default QueryResult updateStats(List<VariantStatsWrapper> variantStatsWrappers, int studyId, QueryOptions queryOptions) {
throw new UnsupportedOperationException();
}
// @Deprecated
// QueryResult updateAnnotations(List<VariantAnnotation> variantAnnotations, QueryOptions queryOptions);
// @Deprecated
// QueryResult getAllVariantsByRegionAndStudies(Region region, List<String> studyIds, QueryOptions options);
// @Deprecated
// QueryResult getAllVariantsByGene(String geneName, QueryOptions options);
// @Deprecated
// QueryResult getMostAffectedGenes(int numGenes, QueryOptions options);
// @Deprecated
// QueryResult getLeastAffectedGenes(int numGenes, QueryOptions options);
// @Deprecated
// QueryResult getTopConsequenceTypes(int numConsequenceTypes, QueryOptions options);
// @Deprecated
// QueryResult getBottomConsequenceTypes(int numConsequenceTypes, QueryOptions options);
}