/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.opencga.storage.app.cli.client.executors; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.datastore.core.QueryParam; import org.opencb.commons.utils.FileUtils; import org.opencb.opencga.storage.app.cli.client.options.StorageVariantCommandOptions; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; import java.util.stream.Collectors; import static org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor.VariantQueryParams.*; /** * Created by imedina on 30/12/15. 
* <p>
* Static helpers that translate the variant command line options into the {@link Query} and
* {@link QueryOptions} objects used to query the variant storage.
*/
public class VariantQueryCommandUtils {

    private static final Logger logger =
            LoggerFactory.getLogger("org.opencb.opencga.storage.app.cli.client.VariantQueryCommandUtils");

    /** Characters that may start the condition part of a {@code --stats} filter (e.g. {@code maf>0.1}). */
    private static final String STAT_OPERATOR_CHARS = "<!~>=";

    /**
     * Copies the basic variant filters from the command line options into the given query.
     * Only non-empty options are added. An explicit region takes precedence over a region file.
     *
     * @param options Basic variant query options read from the command line
     * @param query   Query to be filled in; the same instance is returned
     * @return the given {@code query}
     * @throws Exception if the region file can not be checked or read
     */
    public static Query parseBasicVariantQuery(StorageVariantCommandOptions.BasicVariantQueryOptions options, Query query)
            throws Exception {
        /*
         * Parse Variant parameters
         */
        if (options.region != null && !options.region.isEmpty()) {
            query.put(REGION.key(), options.region);
        } else if (options.regionFile != null && !options.regionFile.isEmpty()) {
            query.put(REGION.key(), readRegionsFromFile(options.regionFile));
        }

        addParam(query, ID, options.id);
        addParam(query, GENE, options.gene);
        addParam(query, TYPE, options.type);

        /*
         * Annotation parameters
         */
        addParam(query, ANNOT_CONSEQUENCE_TYPE, options.consequenceType);
        addParam(query, ANNOT_POPULATION_ALTERNATE_FREQUENCY, options.populationFreqs);
        addParam(query, ANNOT_CONSERVATION, options.conservation);
        addParam(query, ANNOT_FUNCTIONAL_SCORE, options.functionalScore);
        addParam(query, ANNOT_PROTEIN_SUBSTITUTION, options.proteinSubstitution);

        /*
         * Stats parameters: only the MAF filter is read here.
         */
        addParam(query, STATS_MAF, options.maf);

        return query;
    }

    /**
     * Fills the query with the basic variant filters plus the ClinVar and COSMIC filters.
     *
     * @param options Generic variant search options read from the command line
     * @param query   Query to be filled in
     * @return the query returned by {@link #parseBasicVariantQuery}
     * @throws Exception if the region file can not be checked or read
     */
    public static Query parseQuery(StorageVariantCommandOptions.GenericVariantSearchOptions options, Query query)
            throws Exception {
        query = parseBasicVariantQuery(options, query);

        addParam(query, ANNOT_CLINVAR, options.clinvar);
        addParam(query, ANNOT_COSMIC, options.cosmic);

        return query;
    }

    /**
     * Builds a new query from the options of the variant query command.
     *
     * @param queryVariantsOptions Options of the variant query command
     * @param studyNames           Names of all the available studies
     * @return a new query
     * @throws Exception if the region file can not be read, or if the output format accepts one
     *                   single study and the returned study can not be determined
     */
    public static Query parseQuery(StorageVariantCommandOptions.VariantQueryCommandOptions queryVariantsOptions,
                                   List<String> studyNames) throws Exception {
        VariantWriterFactory.VariantOutputFormat of =
                VariantWriterFactory.toOutputFormat(queryVariantsOptions.outputFormat, null);
        return parseGenericVariantQuery(queryVariantsOptions, queryVariantsOptions.study, studyNames,
                queryVariantsOptions.commonQueryOptions.count, of);
    }

    /**
     * Builds a new query from the generic variant query options.
     * <p>
     * When variants are going to be returned (no count, group-by or rank) and the output format is
     * not multi-study, exactly one returned study is required. If none was selected and there is one
     * single available study, that study is used.
     *
     * @param queryVariantsOptions Generic variant query options read from the command line
     * @param studiesFilter        Studies filter. Values are separated by ',' or ';'. Values starting
     *                             with '!' are not used as returned studies
     * @param allStudyNames        Names of all the available studies. May be null
     * @param count                Whether the query is a count
     * @param of                   Output format
     * @return a new query
     * @throws Exception if the region file can not be read, or if the returned study can not be determined
     */
    protected static Query parseGenericVariantQuery(StorageVariantCommandOptions.GenericVariantQueryOptions queryVariantsOptions,
                                                    String studiesFilter, Collection<String> allStudyNames, boolean count,
                                                    VariantWriterFactory.VariantOutputFormat of) throws Exception {
        Query query = new Query();

        /*
         * Parse Variant parameters
         */
        if (queryVariantsOptions.region != null && !queryVariantsOptions.region.isEmpty()) {
            query.put(REGION.key(), queryVariantsOptions.region);
        } else if (queryVariantsOptions.regionFile != null && !queryVariantsOptions.regionFile.isEmpty()) {
            query.put(REGION.key(), readRegionsFromFile(queryVariantsOptions.regionFile));
        }

        addParam(query, ID, queryVariantsOptions.id);
        addParam(query, GENE, queryVariantsOptions.gene);
        addParam(query, TYPE, queryVariantsOptions.type);

        // Studies from the filter that are not negated
        List<String> studies = new ArrayList<>();
        if (StringUtils.isNotEmpty(studiesFilter)) {
            query.put(STUDIES.key(), studiesFilter);
            for (String study : studiesFilter.split(",|;")) {
                if (!study.startsWith("!")) {
                    studies.add(study);
                }
            }
        }

        // If the studies to be returned is empty then we return the studies being queried
        if (queryVariantsOptions.returnStudy != null && !queryVariantsOptions.returnStudy.isEmpty()) {
            query.put(RETURNED_STUDIES.key(), new ArrayList<>(Arrays.asList(queryVariantsOptions.returnStudy.split(","))));
        } else if (!studies.isEmpty()) {
            query.put(RETURNED_STUDIES.key(), studies);
        }

        addParam(query, FILES, queryVariantsOptions.file);
        addParam(query, RETURNED_FILES, queryVariantsOptions.returnFile);
        addParam(query, FILTER, queryVariantsOptions.filter);
        addParam(query, GENOTYPE, queryVariantsOptions.sampleGenotype);
        addParam(query, SAMPLES, queryVariantsOptions.samples);
        addParam(query, RETURNED_SAMPLES, queryVariantsOptions.returnSample);
        addParam(query, UNKNOWN_GENOTYPE, queryVariantsOptions.unknownGenotype);

        /*
         * Annotation parameters
         */
        addParam(query, ANNOT_CONSEQUENCE_TYPE, queryVariantsOptions.consequenceType);
        addParam(query, ANNOT_BIOTYPE, queryVariantsOptions.geneBiotype);
        addParam(query, ANNOT_POPULATION_ALTERNATE_FREQUENCY, queryVariantsOptions.populationFreqs);
        addParam(query, ANNOT_POPULATION_MINOR_ALLELE_FREQUENCY, queryVariantsOptions.populationMaf);
        addParam(query, ANNOT_CONSERVATION, queryVariantsOptions.conservation);
        addProteinSubstitutionFilters(query, queryVariantsOptions.proteinSubstitution);

        /*
         * Stats parameters
         */
        addStatsFilters(query, queryVariantsOptions.stats);

        addParam(query, STATS_MAF, queryVariantsOptions.maf);
        addParam(query, STATS_MGF, queryVariantsOptions.mgf);
        addParam(query, MISSING_ALLELES, queryVariantsOptions.missingAlleleCount);
        addParam(query, MISSING_GENOTYPES, queryVariantsOptions.missingGenotypeCount);

        boolean returnVariants = !count
                && StringUtils.isEmpty(queryVariantsOptions.groupBy)
                && StringUtils.isEmpty(queryVariantsOptions.rank);

        if (returnVariants && !of.isMultiStudyOutput()) {
            int returnedStudiesSize = query.getAsStringList(RETURNED_STUDIES.key()).size();
            // allStudyNames may be null: a missing collection is handled as "no available studies"
            int availableStudiesSize = allStudyNames == null ? 0 : allStudyNames.size();

            if (returnedStudiesSize == 0 && studies.size() == 1) {
                query.put(RETURNED_STUDIES.key(), studies.get(0));
            } else if ((returnedStudiesSize == 0 && availableStudiesSize != 1) // No returned studies, and not exactly one available
                    || returnedStudiesSize > 1) {                              // Or more than one returned study is required
                String availableStudies = availableStudiesSize == 0
                        ? ""
                        : " Available studies: [ " + String.join(", ", allStudyNames) + " ]";
                throw new Exception("Only one study is allowed when returning " + of
                        + ", please use '--return-study' to select the returned " + "study." + availableStudies);
            } else if (returnedStudiesSize == 0) {
                // There were no returned studies: use the single available one
                query.put(RETURNED_STUDIES.key(), allStudyNames.iterator().next());
            }
        }

        return query;
    }

    /**
     * Builds the query options (include, exclude, skip, limit and count) from the options of the
     * variant query command. The result is initialised with a copy of the common params.
     *
     * @param queryVariantsOptions Options of the variant query command
     * @return a new QueryOptions
     */
    public static QueryOptions parseQueryOptions(StorageVariantCommandOptions.VariantQueryCommandOptions queryVariantsOptions) {
        QueryOptions queryOptions = new QueryOptions(new HashMap<>(queryVariantsOptions.commonOptions.params));

        if (StringUtils.isNotEmpty(queryVariantsOptions.commonQueryOptions.include)) {
            queryOptions.add("include", queryVariantsOptions.commonQueryOptions.include);
        }

        if (StringUtils.isNotEmpty(queryVariantsOptions.commonQueryOptions.exclude)) {
            // "_id" is appended to the fields excluded by the user
            queryOptions.add("exclude", queryVariantsOptions.commonQueryOptions.exclude + ",_id");
        }

        if (queryVariantsOptions.commonQueryOptions.skip > 0) {
            queryOptions.add("skip", queryVariantsOptions.commonQueryOptions.skip);
        }

        if (queryVariantsOptions.commonQueryOptions.limit > 0) {
            queryOptions.add("limit", queryVariantsOptions.commonQueryOptions.limit);
        }

        if (queryVariantsOptions.commonQueryOptions.count) {
            queryOptions.add("count", true);
        }

        return queryOptions;
    }

    /**
     * Adds the value to the query if the collection is neither null nor empty.
     *
     * @param query Query to modify
     * @param key   Query param whose key is used
     * @param value Value to add
     */
    protected static void addParam(Query query, QueryParam key, Collection<?> value) {
        if (CollectionUtils.isNotEmpty(value)) {
            query.put(key.key(), value);
        }
    }

    /**
     * Adds the value to the query if the string is neither null nor empty.
     *
     * @param query Query to modify
     * @param key   Query param whose key is used
     * @param value Value to add
     */
    protected static void addParam(Query query, QueryParam key, String value) {
        if (StringUtils.isNotEmpty(value)) {
            query.put(key.key(), value);
        }
    }

    /**
     * Reads a tab-separated regions file and builds a comma separated list of regions with the
     * form {@code <column 1>:<column 4>-<column 5>}, removing "chr" from the first column.
     * Blank lines and lines starting with '#' are skipped.
     *
     * @param regionFile Path to the regions file
     * @return comma separated regions; empty if the file contains no data lines
     * @throws IllegalArgumentException if a data line has less than 5 columns
     * @throws Exception if the file can not be checked or read
     */
    private static String readRegionsFromFile(String regionFile) throws Exception {
        Path gffPath = Paths.get(regionFile);
        FileUtils.checkFile(gffPath);

        List<String> regions = new ArrayList<>();
        int lineNumber = 0;
        for (String line : Files.readAllLines(gffPath)) {
            lineNumber++;
            if (line.trim().isEmpty() || line.startsWith("#")) {
                // Headers, comments and blank lines do not describe any region
                continue;
            }
            String[] columns = line.split("\t");
            if (columns.length < 5) {
                throw new IllegalArgumentException("Malformed line " + lineNumber + " in region file '" + regionFile
                        + "'. Expected at least 5 tab-separated columns, found " + columns.length);
            }
            regions.add(columns[0].replace("chr", "") + ":" + columns[3] + "-" + columns[4]);
        }
        return String.join(",", regions);
    }

    /**
     * Parses a comma separated list of protein substitution filters such as
     * {@code sift<0.2,polyphen>0.5}. Each filter is split into a source name and a condition at
     * the first space, '=', '&lt;' or '&gt;'. The condition is added under the SIFT key, the
     * PolyPhen key or the generic protein substitution key depending on the source name.
     * Filters without a value after the operator are ignored.
     *
     * @param query               Query to modify
     * @param proteinSubstitution Filters to parse. May be null or empty
     */
    private static void addProteinSubstitutionFilters(Query query, String proteinSubstitution) {
        if (proteinSubstitution == null || proteinSubstitution.isEmpty()) {
            return;
        }
        for (String field : proteinSubstitution.split(",")) {
            // More than one token means that there is some value after the first operator
            String[] tokens = field.split("[ =<>]");
            if (tokens.length > 1) {
                String source = tokens[0];
                // The source is a literal prefix of the field: drop it without using it as a regex
                String condition = field.substring(source.length());
                switch (source) {
                    case "sift":
                        query.put(ANNOT_SIFT.key(), condition);
                        break;
                    case "polyphen":
                        query.put(ANNOT_POLYPHEN.key(), condition);
                        break;
                    default:
                        query.put(ANNOT_PROTEIN_SUBSTITUTION.key(), condition);
                        break;
                }
            }
        }
    }

    /**
     * Parses a comma separated list of stats filters such as {@code maf>0.1}. Each filter is split
     * into name and condition at the first operator character, and the condition is added to the
     * query using the name as key.
     *
     * @param query Query to modify
     * @param stats Filters to parse. May be null or empty
     * @throws UnsupportedOperationException if a filter has no operator or its name is not accepted
     */
    private static void addStatsFilters(Query query, String stats) {
        if (stats == null || stats.isEmpty()) {
            return;
        }
        Set<String> acceptedStatKeys = new HashSet<>(Arrays.asList(
                STATS_MAF.key(),
                STATS_MGF.key(),
                MISSING_ALLELES.key(),
                MISSING_GENOTYPES.key()));

        for (String stat : stats.split(",")) {
            // Split at the earliest operator character, whichever it is
            int index = StringUtils.indexOfAny(stat, STAT_OPERATOR_CHARS);
            if (index < 0) {
                throw new UnsupportedOperationException("Unknown stat filter operation: " + stat);
            }
            String name = stat.substring(0, index);
            String cond = stat.substring(index);

            if (!acceptedStatKeys.contains(name)) {
                throw new UnsupportedOperationException("Unknown stat filter name: " + name);
            }
            query.put(name, cond);
            logger.info("Parsed stat filter: {} {}", name, cond);
        }
    }
}