/* * Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute * Copyright [2016-2017] EMBL-European Bioinformatics Institute * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ensembl.healthcheck.testcase.variation; import java.sql.Connection; import java.util.ArrayList; import java.util.List; import org.ensembl.healthcheck.DatabaseRegistry; import org.ensembl.healthcheck.DatabaseRegistryEntry; import org.ensembl.healthcheck.DatabaseType; import org.ensembl.healthcheck.ReportManager; import org.ensembl.healthcheck.Team; import org.ensembl.healthcheck.testcase.MultiDatabaseTestCase; import org.ensembl.healthcheck.util.DBUtils; /** * An EnsEMBL Healthcheck test case that looks for broken foreign-key relationships between core and variation database. */ public class VFCoordinates extends MultiDatabaseTestCase { /** * Create an ForeignKeyCoreId that applies to a specific set of databases. */ public VFCoordinates() { /* addToGroup("variation-release"); */ setDescription("Check for possible wrong coordinates in Vf table, due to wrong length or outside range seq_region."); // setHintLongRunning(true); setTeamResponsible(Team.VARIATION); } /** * Run the test. * * @param dbr * Registry containing the databases to check, in order core->variation * @return true if same transcripts and seq_regions in core and variation are the same. * */ public boolean run(DatabaseRegistry dbr) { boolean allResult = true; DatabaseRegistryEntry[] variationDBs = dbr.getAll(DatabaseType.VARIATION); for (int i = 0; i < variationDBs.length; i++) { boolean result = true; DatabaseRegistryEntry dbrvar = variationDBs[i]; String variationName = dbrvar.getName(); // the database registry parameter dbr only contains the databases matching the regular expression passed on the command line // so create a database registry containing all the core databases and find the one we want List<String> coreRegexps = new ArrayList<String>(); coreRegexps.add(".*_core_.*"); DatabaseRegistry allDBR = new DatabaseRegistry(coreRegexps, null, null, false); String coreName = variationName.replaceAll("variation", "core"); DatabaseRegistryEntry dbrcore = allDBR.getByExactName(coreName); if (dbrcore == null) { logger.severe("Incorrect core database " + coreName + " for " + variationName); return false; } Connection con = dbrvar.getConnection(); System.out.println("Using " + coreName + " as core database and " + variationName + " as variation database"); int mc = DBUtils.getRowCount( con, "SELECT COUNT(*) FROM " + variationName + ".variation_feature vf LEFT JOIN " + variationName + ".failed_variation f ON vf.variation_id = f.variation_id WHERE f.variation_id IS NULL AND length(vf.allele_string) = 3 and vf.seq_region_start<> vf.seq_region_end and vf.allele_string NOT LIKE '%-%'"); if (mc > 0) { ReportManager.problem(this, con, "Wrong allele length !! (allele_string <> coordinates length) for " + mc + " entries in " + variationName); result = false; } mc = DBUtils.getRowCount(con, "SELECT COUNT(*) FROM " + coreName + ".seq_region s, " + variationName + ".variation_feature vf WHERE vf.seq_region_id = s.seq_region_id AND vf.seq_region_end > s.length"); if (mc > 0) { ReportManager.problem(this, con, "Variation Features outside range in " + variationName); result = false; } mc = DBUtils.getRowCount(con, "SELECT COUNT(*) FROM " + variationName + ".variation_feature vf WHERE vf.seq_region_start = 1 AND vf.seq_region_end > 1"); if (mc > 0) { ReportManager.problem(this, con, "Variation Features with coordinates = 1 " + variationName); result = false; } // Check that no VFs are on the negative strand, unless they have map_weight > 1 and/or are located on non-reference // seq_regions or correspond to CNV probes String vfId = DBUtils.getRowColumnValue(con, "SELECT vf.variation_feature_id FROM " + variationName + ".variation_feature vf WHERE vf.seq_region_strand = -1 AND vf.map_weight = 1 AND vf.allele_string NOT LIKE 'CNV_PROBE' AND NOT EXISTS (SELECT * FROM " + coreName + ".seq_region_attrib sra JOIN " + coreName + ".attrib_type at USING (attrib_type_id) WHERE sra.seq_region_id = vf.seq_region_id AND at.code = 'non_ref') LIMIT 1"); if (vfId.length() > 0) { ReportManager.problem(this, con, "Variation Features on the negative strand (e.g. variation_feature_id = " + vfId + ") in " + variationName); result = false; } // Check that no VFs are duplicated mc = DBUtils.getRowCount(con, "SELECT COUNT(DISTINCT vf1.variation_id) FROM " + variationName + ".variation_feature vf1 JOIN " + variationName + ".variation_feature vf2 USING (variation_id,seq_region_id,seq_region_start,seq_region_end,seq_region_strand) WHERE vf1.variation_feature_id < vf2.variation_feature_id"); if (mc > 0) { ReportManager.problem(this, con, "There are duplicated Variation Features for " + String.valueOf(mc) + " variations in " + variationName); result = false; } if (result) { ReportManager.correct(this, con, "VFCoordinates test run successfully"); } allResult = (allResult && result); } return allResult; } /** * This only applies to variation databases. */ public void types() { removeAppliesToType(DatabaseType.OTHERFEATURES); removeAppliesToType(DatabaseType.CDNA); removeAppliesToType(DatabaseType.CORE); removeAppliesToType(DatabaseType.VEGA); } } // VFCoordinates