/* * Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute * Copyright [2016-2017] EMBL-European Bioinformatics Institute * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ensembl.healthcheck.testcase.generic; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import org.ensembl.healthcheck.DatabaseRegistryEntry; import org.ensembl.healthcheck.DatabaseType; import org.ensembl.healthcheck.ReportManager; import org.ensembl.healthcheck.Team; import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase; import org.ensembl.healthcheck.util.DBUtils; import org.ensembl.healthcheck.util.SqlTemplate; /** * Check that all top-level seq regions have some gene density features, and that the values agree between the * density_feature and seq_region attrib tables. Only checks top-level seq regions that do NOT have an _ in their names. Also checks * that there are some density features for each analysis/density type. Also checks that there are no duplicates in the * seq_region_attrib table. */ public class DensityFeatures extends SingleDatabaseTestCase { // max number of top-level seq regions to check private static final int MAX_TOP_LEVEL = 100; // map between analysis.logic_name and seq_region attrib_type.code @SuppressWarnings("rawtypes") private Map logicNameToAttribCode = new HashMap(); /** * Create a new DensityFeatures testcase. */ @SuppressWarnings("unchecked") public DensityFeatures() { setDescription("Check that all top-level seq regions have some gene density features, and that the values agree between the density_feature and seq_region attrib tables."); setFailureText("If the genome has been assembled using short-read sequences, some seq_regions might not have density_features"); logicNameToAttribCode.put("CodingDensity", "coding_cnt"); logicNameToAttribCode.put("PseudogeneDensity", "pseudogene_cnt"); logicNameToAttribCode.put("ShortNonCodingDensity", "noncoding_cnt_s"); logicNameToAttribCode.put("LongNonCodingDensity", "noncoding_cnt_l"); setTeamResponsible(Team.RELEASE_COORDINATOR); } // ---------------------------------------------------------------------- public void types() { removeAppliesToType(DatabaseType.OTHERFEATURES); removeAppliesToType(DatabaseType.ESTGENE); removeAppliesToType(DatabaseType.EST); removeAppliesToType(DatabaseType.CDNA); removeAppliesToType(DatabaseType.VEGA); removeAppliesToType(DatabaseType.RNASEQ); } /**Integer.valueOf( * Run the test. * * @param dbre * The database to use. * @return true if the test passed. * */ @SuppressWarnings("unchecked") public boolean run(DatabaseRegistryEntry dbre) { if (dbre.getType() == DatabaseType.SANGER_VEGA) { logicNameToAttribCode.put("PCodDensity", "knownGeneCount"); logicNameToAttribCode.remove("CodingDensity"); logicNameToAttribCode.remove("PseudogeneDensity"); logicNameToAttribCode.put("ShortNonCodingDensity", "noncoding_cnt_s"); logicNameToAttribCode.put("LongNonCodingDensity", "noncoding_cnt_l"); } boolean result = true; Connection con = dbre.getConnection(); SqlTemplate t = DBUtils.getSqlTemplate(dbre); // Density features needed only for species with a karyotype String sqlKaryotype = "SELECT count(*) FROM seq_region_attrib sa, attrib_type at WHERE at.attrib_type_id = sa.attrib_type_id AND code = 'karyotype_rank'"; int karyotype = t.queryForDefaultObject(sqlKaryotype, Integer.class); if (karyotype == 0) { return result; } result &= checkFeaturesAndCounts(con); result &= checkAnalysisAndDensityTypes(dbre); result &= checkDensityTypes(con); result &= checkFeatureSeqRegions(con); return result; } // run // ---------------------------------------------------------------------- @SuppressWarnings("rawtypes") private boolean checkFeaturesAndCounts(Connection con) { boolean result = true; // get top level co-ordinate system ID String sql = "SELECT coord_system_id FROM coord_system WHERE rank=1 LIMIT 1"; String s = DBUtils.getRowColumnValue(con, sql); if (s.length() == 0) { logger.warning("Error: can't get top-level co-ordinate system for " + DBUtils.getShortDatabaseName(con)); return false; } int topLevelCSID = Integer.parseInt(s); try { // check each top-level seq_region (up to a limit) to see how many density // features there are Statement stmt = con.createStatement(); ResultSet rs = stmt.executeQuery("SELECT s.seq_region_id, s.name, CASE WHEN ae.seq_region_id IS NULL THEN 0 ELSE 1 END as exception FROM seq_region_attrib sa, attrib_type at, seq_region s LEFT JOIN assembly_exception ae ON s.seq_region_id = ae.seq_region_id WHERE s.seq_region_id = sa.seq_region_id AND sa.attrib_type_id = at.attrib_type_id AND at.code = 'karyotype_rank' AND coord_system_id=" + topLevelCSID + " AND (exc_type IN ('HAP', 'PAR') or exc_type IS NULL) GROUP BY s.seq_region_id, s.name, exception"); int numTopLevel = 0; int noDensity = 0; while (rs.next() && numTopLevel++ < MAX_TOP_LEVEL) { long seqRegionID = rs.getLong("s.seq_region_id"); String seqRegionName = rs.getString("s.name"); boolean assemblyException = rs.getBoolean("exception"); logger.fine("Counting density features on seq_region " + seqRegionName); sql = "SELECT COUNT(*) FROM density_feature WHERE seq_region_id=" + seqRegionID; int dfRows = DBUtils.getRowCount(con, sql); if (dfRows == 0) { noDensity++; } // check each analysis type Iterator it = logicNameToAttribCode.keySet().iterator(); while (it.hasNext()) { String logicName = (String) it.next(); String attribCode = (String) logicNameToAttribCode.get(logicName); // check if this species has appropriate density features int analRows = DBUtils.getRowCount(con, "SELECT COUNT(*) FROM analysis WHERE logic_name='" + logicName + "'"); if (analRows == 0) { logger.fine(DBUtils.getShortDatabaseName(con) + " has no " + logicName + " analysis type, skipping checks for these features"); } else { // check that the sum of the density_feature.density_value matches // what // is in the seq_region_attrib table logger.fine("Comparing density_feature.density_value with seq_region_attrib for " + logicName + " features on " + seqRegionName); sql = "SELECT SUM(df.density_value) FROM density_type dt, density_feature df, analysis a WHERE dt.density_type_id=df.density_type_id AND dt.analysis_id=a.analysis_id AND a.logic_name='" + logicName + "' AND seq_region_id=" + seqRegionID; String sumDF = DBUtils.getRowColumnValue(con, sql); // System.out.println(sql + " " + sumDF); //don't check the sum for haplotypes or PAR regions if (sumDF != null && sumDF.length() > 0 && !assemblyException) { long sumFromDensityFeature = Long.parseLong(sumDF); sql = "SELECT value FROM seq_region_attrib sra, attrib_type at WHERE sra.attrib_type_id=at.attrib_type_id AND at.code='" + attribCode + "' AND seq_region_id=" + seqRegionID; String sumSRA = DBUtils.getRowColumnValue(con, sql); // System.out.println(sql + " " + sumSRA); if (sumSRA != null && sumSRA.length() > 0) { long valueFromSeqRegionAttrib = Long.parseLong(sumSRA); if (Math.abs(sumFromDensityFeature - valueFromSeqRegionAttrib) > 1000) { // allow a bit of leeway ReportManager.problem(this, con, "Sum of values for " + logicName + " from density_feature (" + sumFromDensityFeature + ") doesn't agree with value from seq_region_attrib (" + valueFromSeqRegionAttrib + ") for " + seqRegionName); result = false; } } // if sumSRA if (sumSRA.length() == 0) { ReportManager.problem(this, con, seqRegionName + " has no seq_region_attrib for " + attribCode); result = false; } } // if sumDF } // if rows } // while it } // while rs.next if (noDensity > 0) { ReportManager.problem(this, con, noDensity + " of the " + MAX_TOP_LEVEL + " first toplevel regions have no density features"); result = false; } rs.close(); stmt.close(); if (numTopLevel == MAX_TOP_LEVEL) { logger.warning("Only checked first " + numTopLevel + " seq_regions"); } } catch (SQLException se) { se.printStackTrace(); } return result; } // ---------------------------------------------------------------------- /** * Check that each analysis_id is used at least one one density_type. */ private boolean checkAnalysisAndDensityTypes(DatabaseRegistryEntry dbre) { boolean result = true; Connection con = dbre.getConnection(); // Species species = dbre.getSpecies(); String[] logicNames; if (dbre.getType() == DatabaseType.SANGER_VEGA) { logicNames = new String[] { "PCodDensity" }; } else { logicNames = new String[] { "PercentGC", "PercentageRepeat", "CodingDensity", "PseudogeneDensity", "ShortNonCodingDensity", "LongNonCodingDensity" }; } // check that each analysis_id is only used by one density_type for (int i = 0; i < logicNames.length; i++) { String logicName = logicNames[i]; String sql = "SELECT dt.density_type_id FROM analysis a, density_type dt WHERE a.analysis_id=dt.analysis_id AND a.logic_name='" + logicName + "'"; String[] rows = DBUtils.getColumnValues(con, sql); if (rows.length == 0) { if (dbre.getType() != DatabaseType.SANGER_VEGA || logicName.equalsIgnoreCase("knownGeneDensity")) {// for sangervega only // report analysis ReportManager.problem(this, con, "RelCo: No entry in density_type for analysis " + logicName + " - run density pipeline"); } result = false; } // note UNIQUE constraint prevents duplicate analysis_id/block_size values } return result; } // ---------------------------------------------------------------------- /** * Check for density_types that reference non-existent analysis_ids. */ private boolean checkDensityTypes(Connection con) { boolean result = true; String sql = "SELECT dt.density_type_id FROM density_type dt LEFT JOIN analysis a ON dt.analysis_id=a.analysis_id WHERE a.analysis_id IS NULL"; String[] rows = DBUtils.getColumnValues(con, sql); if (rows.length == 0) { ReportManager.correct(this, con, "All density_types reference existing analysis_ids"); } else { for (int j = 0; j < rows.length; j++) { ReportManager.problem(this, con, "density_type with ID " + rows[j] + " references non-existent analysis"); } result = false; } return result; } // ---------------------------------------------------------------------- /** * Check for density_features that link to non-existent seq_regions. */ private boolean checkFeatureSeqRegions(Connection con) { boolean result = true; logger.finest("Checking density_feature-seq_region links"); int orphans = countOrphans(con, "density_feature", "seq_region_id", "seq_region", "seq_region_id", true); if (orphans > 0) { ReportManager.problem(this, con, orphans + " density_features reference non-existent seq_regions"); result = false; } return result; } // ---------------------------------------------------------------------- } // DensityFeatures