/* * Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute * Copyright [2016-2017] EMBL-European Bioinformatics Institute * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ensembl.healthcheck.testcase.generic; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import org.ensembl.healthcheck.DatabaseRegistryEntry; import org.ensembl.healthcheck.DatabaseType; import org.ensembl.healthcheck.ReportManager; import org.ensembl.healthcheck.Species; import org.ensembl.healthcheck.Team; import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase; import org.ensembl.healthcheck.util.DBUtils; /** * */ public class Ditag extends SingleDatabaseTestCase { // max number of top-level seq regions to check private static final int MAX_TOP_LEVEL = 100; /** * Creates a new instance of Ditag. */ public Ditag() { setDescription("Checks that ditag_features exist, that they all have a ditag entry and that all chromosomes have some ditag_features"); setTeamResponsible(Team.GENEBUILD); } /** * This test only applies to core databases. */ public void types() { removeAppliesToType(DatabaseType.OTHERFEATURES); removeAppliesToType(DatabaseType.ESTGENE); removeAppliesToType(DatabaseType.VEGA); removeAppliesToType(DatabaseType.SANGER_VEGA); removeAppliesToType(DatabaseType.CDNA); removeAppliesToType(DatabaseType.RNASEQ); } /** * Test various things about ditag features. * * @param dbre * The database to use. * @return Result. */ public boolean run(DatabaseRegistryEntry dbre) { boolean result = true; Connection con = dbre.getConnection(); // only check for human, mouse Species s = dbre.getSpecies(); if (s.equals(Species.HOMO_SAPIENS) || s.equals(Species.MUS_MUSCULUS) || s.equals(Species.ORYZIAS_LATIPES)) { result &= checkExistance(con); // If there are no ditag records, don't make a lot of noise // saying they're missing for each chromosome if(!result) { return false; } result &= checkDitagRelation(con); result &= checkAllChromosomesHaveDitagFeatures(con); result &= checkForSingles(con); result &= checkForMultis(con); } return result; } // ---------------------------------------------------------------------- /* * Verify ditags & ditag features exist. */ private boolean checkExistance(Connection con) { boolean result = true; int rowCount1 = DBUtils.getRowCount(con, "SELECT * FROM ditag LIMIT 10"); if (rowCount1 == 0) { ReportManager.problem(this, con, "No ditags in databaset"); // If there are no ditags in the dataset, we don't care if // there are features, that's irrelevant, skip return false; } int rowCount2 = DBUtils.getRowCount(con, "SELECT * FROM ditag_feature LIMIT 10"); if (rowCount2 == 0) { ReportManager.problem(this, con, "No ditag features in databaset"); result = false; } if (result) { ReportManager.correct(this, con, "Found entries in ditag & ditag_feature tables."); } return result; } // ---------------------------------------------------------------------- /** * Check that all ditag_features have a ditag entry and that there are not more ditag mappings than allowed. */ private boolean checkDitagRelation(Connection con) { boolean result = true; int count = DBUtils.getRowCount(con, "SELECT COUNT(*) FROM ditag_feature df LEFT JOIN ditag d ON d.ditag_id=df.ditag_id WHERE d.ditag_id IS NULL"); if (count > 0) { ReportManager.problem(this, con, " There are " + count + " ditag_features without ditag entry."); result = false; } else { ReportManager.correct(this, con, "All ditag_features have ditag entries."); } return result; } // ---------------------------------------------------------------------- /** * Check that all chromomes have > 0 ditag_features. */ private boolean checkAllChromosomesHaveDitagFeatures(Connection con) { boolean result = true; // find all the chromosomes, and for each one check that it has some markers // note a "chromosome" is assumed to be a seq_region that is: // - on the top-level co-ordinate system and // - doesn't have and _ or . in the name and // - has a seq_region name of less than 3 characters // - doesn't have a name starting with "Un" or "MT" // get top level co-ordinate system ID String sql = "SELECT coord_system_id FROM coord_system WHERE rank=1 LIMIT 1"; String s = DBUtils.getRowColumnValue(con, sql); if (s.length() == 0) { System.err.println("Error: can't get top-level co-ordinate system for " + DBUtils.getShortDatabaseName(con)); return false; } int topLevelCSID = Integer.parseInt(s); try { // check each top-level seq_region (up to a limit) to see how many // marker_map_locations and marker features there are Statement stmt = con.createStatement(); ResultSet rs = stmt.executeQuery("SELECT * FROM seq_region WHERE coord_system_id=" + topLevelCSID + " AND name NOT LIKE '%\\_%' AND name NOT LIKE '%.%' AND name NOT LIKE 'Un%' AND name NOT LIKE 'MT%' AND LENGTH(name) < 3 ORDER BY name"); int numTopLevel = 0; while (rs.next() && numTopLevel++ < MAX_TOP_LEVEL) { long seqRegionID = rs.getLong("seq_region_id"); String seqRegionName = rs.getString("name"); // check ditag_features logger.fine("Counting ditag_features on chromosome " + seqRegionName); sql = "SELECT COUNT(*) FROM ditag_feature WHERE seq_region_id=" + seqRegionID; int rows = DBUtils.getRowCount(con, sql); if (rows == 0) { ReportManager.problem(this, con, "Chromosome " + seqRegionName + " (seq_region_id " + seqRegionID + ") has no ditag_features"); result = false; } else { ReportManager.correct(this, con, "Chromosome " + seqRegionName + " has " + rows + " ditag_features"); } } rs.close(); stmt.close(); if (numTopLevel == MAX_TOP_LEVEL) { logger.warning("Only checked first " + numTopLevel + " seq_regions"); } } catch (SQLException se) { se.printStackTrace(); } return result; } // ---------------------------------------------------------------------- /** * Check that all ditags (that are not CAGE tags) have start AND end tags mapped. */ private boolean checkForSingles(Connection con) { boolean result = true; String analysis_ids = ""; try { // Get the analysis ids of the ditags Statement stmt = con.createStatement(); ResultSet rs = stmt.executeQuery("SELECT DISTINCT analysis_id FROM ditag_feature;"); while (rs.next()) { String analysis_id = rs.getString("analysis_id"); if (analysis_ids.length() > 0) { analysis_ids = analysis_ids + ", " + analysis_id; } else { analysis_ids = analysis_id + " "; } } if (analysis_ids.length() == 0) { return true; } // Check for ditag_ids that occur only once, ignore CAGE tags ("F") String sql = "SELECT COUNT(*) AS singles FROM (select count(*) as count from ditag_feature df where analysis_id IN(" + analysis_ids + ") and df.ditag_side!='F' group by ditag_id, ditag_pair_id having count=1) as counter LIMIT 5;"; int count = 0; rs = stmt.executeQuery(sql); rs.next(); count = rs.getInt("singles"); if (count > 0) { ReportManager.problem(this, con, " There are ditag_features without a partner (start/end)!"); result = false; } else { ReportManager.correct(this, con, "All ditag_features have ditag partners (start/end)."); } } catch (SQLException se) { se.printStackTrace(); } return result; } // ---------------------------------------------------------------------- /** * Check that no ditags have more than 2 ditag_features with the same ditag_id & ditag_pair_id */ private boolean checkForMultis(Connection con) { boolean result = true; String analysis_ids = ""; try { // Get the analysis ids of the ditags Statement stmt = con.createStatement(); ResultSet rs = stmt.executeQuery("SELECT DISTINCT analysis_id FROM ditag_feature;"); while (rs.next()) { String analysis_id = rs.getString("analysis_id"); if (analysis_ids.length() > 0) { analysis_ids = analysis_ids + ", " + analysis_id; } else { analysis_ids = analysis_id + " "; } } if (analysis_ids.length() == 0) { return true; } // Check for ditag_ids that occur only once, ignore CAGE tags ("F") String sql = "SELECT COUNT(*) AS multis FROM (select count(*) as count from ditag_feature df where analysis_id IN(" + analysis_ids + ") and df.ditag_side!='F' group by ditag_id, ditag_pair_id having count>2 LIMIT 1) as counter;"; int count = 0; rs = stmt.executeQuery(sql); rs.next(); count = rs.getInt("multis"); if (count > 0) { ReportManager.problem(this, con, " There are ditag_features with more than two features " + "in same (ditag_id/ditag_pair_id) group!.\n" + sql); result = false; } else { ReportManager.correct(this, con, "All ditag_features groups have 2 partners (start/end)."); } } catch (SQLException se) { se.printStackTrace(); } return result; } }