/* * Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute * Copyright [2016-2017] EMBL-European Bioinformatics Institute * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ensembl.healthcheck.testcase.funcgen; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.util.*; import java.util.regex.Pattern; import org.ensembl.healthcheck.DatabaseRegistryEntry; import org.ensembl.healthcheck.DatabaseType; import org.ensembl.healthcheck.ReportManager; import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase; import org.ensembl.healthcheck.util.DBUtils; import org.ensembl.healthcheck.Team; /** * Check Array xrefs: - that each chromosome has at least 1 Probe/Set xref * * Assumptions: Array Probe/ProbeSet xrefs and transcripts are both in the default chromosome coordinate system. * */ /** * To do * 1 Add support for Probe level xrefs * 2 Group counts by array? This is already done in ComparePreviousVersionArraysXrefs? */ public class ArrayXrefs extends SingleDatabaseTestCase { // if a database has more than this number of seq_regions in the chromosome coordinate system, it's ignored private static final int MAX_CHROMOSOMES = 75; /** * Creates a new instance of OligoXrefs */ public ArrayXrefs() { //addToGroup("post_genebuild"); addToGroup("funcgen"); addToGroup("funcgen-release"); setTeamResponsible(Team.FUNCGEN); setDescription("Check Array probe2transcript xrefs"); setHintLongRunning(true); } public void types() { removeAppliesToType(DatabaseType.CORE); removeAppliesToType(DatabaseType.OTHERFEATURES); removeAppliesToType(DatabaseType.CDNA); removeAppliesToType(DatabaseType.VEGA); } /** * Check all chromosomes have xrefs for each DISPLAYABLE array. * * Get a list of chromosomes, then check the number of xrefs associated with each one. Fail is any chromosome has 0 xrefs. * Is this even possible now we have transcripts in a separate DB? * This is really essential for new arrays, as we can't rely on ComparePrevious * Could do this via cross DB query only if on same server * * @param dbre * The database to use. * @return true if the test passed. * */ public boolean run(DatabaseRegistryEntry dbre) { if (Pattern.matches("master_schema_funcgen_\\d+", dbre.getName())) { logger.fine("Skipping " + dbre.getName()); return true; } boolean result = true; Connection efgCon = dbre.getConnection(); // check that all arrays (except Illumina Infinium) have // MART_DISPLAYABLE status try { ResultSet rs = efgCon.createStatement().executeQuery("select a" + ".array_id, a.name, a.class, s.status_name_id from array " + "a left join status s on a.array_id=s.table_id and s" + ".table_name='array' and s.status_name_id=(select " + "status_name_id from status_name where " + "name='MART_DISPLAYABLE') where status_name_id is NULL " + "and a.class!='ILLUMINA_INFINIUM'"); while (rs.next()) { int arrayID = rs.getInt(1); String arrayName = rs.getString(2); ReportManager.problem(this, efgCon, "Array " + arrayID + " " + arrayName + " has no MART_DISPLAYABLE status"); result = false; } rs.close(); } catch (SQLException e) { e.printStackTrace(); result = false; } /** To do * Change xref counting to include all db versions, warn if more than one and fail if some are missing * Write perl script to log counts? */ // Check if there are any DISPLAYABLE Arrays - if so there should be Xrefs // Checks EPXRESSION and CGH arrays only // Integer displayableArrays = DBUtils.getRowCount(efgCon, "SELECT COUNT(*) FROM array a, status s, status_name sn where sn.name='DISPLAYABLE' and " + // "sn.status_name_id=s.status_name_id and s.table_name='array' and s.table_id=a.array_id"); int expressionArrays = DBUtils.getRowCount(efgCon, "SELECT COUNT(*) FROM array a where (format='EXPRESSION' OR format='CGH')"); if ( expressionArrays == 0) { //Assume we should always have EXPRESSION arrays ReportManager.problem(this, efgCon, DBUtils.getShortDatabaseName(efgCon) + " has no EXPRESSION Arrays, not checking for probe2transcript xrefs"); return false; } else{ ReportManager.correct(this, efgCon, DBUtils.getShortDatabaseName(efgCon) + " has " + expressionArrays + " EXPRESSION|CGH arrays"); } // if ( displayableArrays < expressionArrays ){ // ReportManager.problem(this, efgCon, "Database contains non-DISPLAYABLE EXPRESSION Arrays"); // result = false; // } StringBuffer hiddenArrays = new StringBuffer(); try { ResultSet rs = efgCon.createStatement().executeQuery("SELECT a.name, sn1.name from array a left join " + "(SELECT s.table_id, sn.name from status s, status_name sn where sn.name='DISPLAYABLE' and " + "sn.status_name_id=s.status_name_id and s.table_name='array') sn1 on sn1.table_id=a.array_id " + "WHERE (a.format ='EXPRESSION' OR a.format='CGH')"); while (rs.next()){ String arrayStatus = rs.getString(2); String arrayName = rs.getString(1); if (arrayStatus == null) hiddenArrays.append(arrayName + " "); } rs.close(); if(hiddenArrays.length() != 0){ result = false; ReportManager.problem(this, efgCon, "Database contains non-DISPLAYABLE EXPRESSION Arrays:\t" + hiddenArrays); } else{ ReportManager.correct(this, efgCon, "All EXPRESSION|CGH arrays in " + DBUtils.getShortDatabaseName(efgCon) + " are DISPLAYABLE"); } } catch (SQLException se) { se.printStackTrace(); return false; } //Get the matching core dbre to do the cross DB join //Assume we have the standard name for the core DB and it is on the same host String schemaBuild = dbre.getSchemaVersion() + "_" + dbre.getGeneBuildVersion(); String coreDBName = getCoreDbName(dbre, schemaBuild); //Never get's loaded if we specify a pattern //DatabaseRegistryEntry coreDbre = dbre.getDatabaseRegistry().getByExactName(coreDBName); System.out.println("Getting DBRE by pattern:\t" + coreDBName); DatabaseRegistryEntry coreDbre = getDatabaseRegistryEntryByPattern(coreDBName); //This may still not be on the same server if database.properties has two servers configured if (coreDbre == null){ ReportManager.problem(this, efgCon, "Could not default core DB:\t" + coreDBName); return false; } //And test the hosts are the same //assume user/pass will be able to access DBs on the same server //do not test DatabaseServer object, it may be a different if (! coreDbre.getDatabaseServer().getDatabaseURL().equals(dbre.getDatabaseServer().getDatabaseURL())){ ReportManager.problem(this, efgCon, "Unable to perform chromosome xref counts as efg and core DB are not on the same DatabaseServer:\t" + "core " + coreDbre.getDatabaseServer().getDatabaseURL() + "\tefg " + dbre.getDatabaseServer().getDatabaseURL()); return false; } // find all chromosomes in default assembly coordinate system // should really be parameterized Map<String, String> srID2name = new HashMap<String, String>(); Map<String, String> coreSrID2efg = new HashMap<String, String>(); // Die if we don't see the current schema build and is the only one that is_current // Otherwise we cannot be sure that all seq_region records have been updated String csName = DBUtils.getRowColumnValue(coreDbre.getConnection(), "SELECT name FROM coord_system order by rank desc limit 1"); if(csName == null){ ReportManager.problem(this, efgCon, "Could not identify coord_system entries for schema_build:\t" + schemaBuild); return false; } try { ResultSet rs = efgCon.createStatement().executeQuery("SELECT coord_system_id, rank, schema_build " + "FROM coord_system WHERE is_current=1 and schema_build is not null AND name='" + csName + "'"); // Should never have null schema_build entries int csRank = 999999999; String csID = ""; // Can't just declare here and init in the while as this causes a compilation error while(rs.next()){ if (! rs.getString(3).equals(schemaBuild)){ ReportManager.problem(this, efgCon, "Found an 'is_current' " + csName + "coord_system with unexpected schema_build:\t" + rs.getString(3)); return false; } // Do we need a attrib 'default' check here too? // Get highest ranking csID if (rs.getInt(2) < csRank){ csRank = rs.getInt(2); csID = rs.getString(1); } } rs.close(); rs = efgCon.createStatement().executeQuery("SELECT s.seq_region_id, s.name, s.core_seq_region_id " + "FROM seq_region s WHERE s.coord_system_id=" + csID + " and s.name not like '%\\_%' group by s.seq_region_id"); //Do we even need this core_seq_region_id translation? //Just link via the sr.name! while (rs.next()){ srID2name.put(rs.getString(1), rs.getString(2)); coreSrID2efg.put(rs.getString(3), rs.getString(1)); } rs.close(); if (srID2name.size() > MAX_CHROMOSOMES) { ReportManager.info(this, efgCon, "Database has " + srID2name.size() + " seq_regions in 'chromosome' coordinate system"); // return false; // No longer skip here } // Count the number of xrefs for each chr Map<String, String> coreSrIDcounts = new HashMap<String, String>(); // (Optimisation: faster to use "in list" of external_db_ids than SQL // join.) StringBuffer inList = new StringBuffer(); String edbName = dbre.getSpecies() + "_core_Transcript"; String[] assemblyBuild = schemaBuild.split("_"); String xrefQuery = ""; String edbClause = ""; //We really need to match the genebuild between the edb and the schema_build //otherwise we have out of date data? //Not enirely true, there are plenty of data changes which can cause a version bump which don't affect array mapping //Let's just get all of them first, and warn if there are any which don't match the assemblyBuild //Update to count and list counts for each obj type, db_release and analysis_id? String [] exdbIDs = DBUtils.getColumnValues ( efgCon, "select distinct edb.external_db_id from external_db edb " + " join xref x on edb.external_db_id=x.external_db_id join object_xref ox on x.xref_id=ox.xref_id " + " and ox.ensembl_object_type in ('Probe', 'ProbeFeature', 'ProbeSet')" ); //need edb ids here as edbClause is used elsewhere //Update to count and list counts for each obj type, db_release and analysis_id? //select edb.db_release, ox.ensembl_object_type, count(*), edb.external_db_id, a.logic_name from external_db edb join xref x on edb.external_db_id=x.external_db_id join object_xref ox on x.xref_id=ox.xref_id and ox.ensembl_object_type in ('Probe', 'ProbeFeature', 'ProbeSet') join analysis a on ox.analysis_id=a.analysis_id group by edb.db_release, ox.ensembl_object_type, a.logic_name; //Catch absent edbs if(exdbIDs.length == 0){ ReportManager.problem(this, efgCon, "Could not identify external_db " + edbName + " with associated Probe, ProbeFeature or ProbeSet object_xrefs"); result = false; } else{ //Handle mutliple edbs for (int i = 0; i < exdbIDs.length; i++) { inList.append(exdbIDs[i]); if(i != (exdbIDs.length -1)){ inList.append(","); } } edbClause = "and x.external_db_id in (" + inList + ")"; String dbReleases [] = DBUtils.getColumnValues (efgCon, "select db_release from external_db where external_db_id in(" + inList + ")"); if(exdbIDs.length == 1){ // Test it matches the assembly/build if(! dbReleases[0].matches(".*" + assemblyBuild[1]) ){ ReportManager.warning(this, efgCon, "Xrefs are associated with an external_db which does not match the current assembly/build:\t" + Arrays.toString(dbReleases)); //This is actually unsafe now. As we can't identify whether a transcript set has been updated //simply using the schema_build. Will have to use meta keys } } else{ // >1 simply list all the db_releases //This maybe valid if we have ProbeFeature genomic and transcript xrefs on different releases ReportManager.warning(this, efgCon, "Found multiple external_db db_release versions with xrefs:\n\t" + Arrays.toString(dbReleases)); //These may not also match the current build, but not test here } } //Set which objects we are looking for i.e. Probe or ProbeSets int[] xrefObjects = new int[2]; xrefObjects[0] = DBUtils.getRowCount(efgCon, "select count(*) from array where vendor='AFFY'"); xrefObjects[1] = DBUtils.getRowCount(efgCon, "select count(*) from array where vendor!='AFFY'"); for (int i = 0; i <= 1; i++) { String xrefObj = (i == 0) ? "ProbeSet" : "Probe"; String arrayVendor = (i == 0) ? "Affy" : "Non-Affy"; if(xrefObjects[i] == 0){ ReportManager.info(this, efgCon, "Has no " + arrayVendor + " arrays. " + " " + xrefObj + " xref counts will be skipped"); //result = false; This should be a wrning rathe than a problem as we are not returning false? } else{ xrefQuery = "select t.seq_region_id, count(*) as count from " + coreDBName + ".transcript t, " + " object_xref ox, xref x " + "where t.stable_id=x.dbprimary_acc " + "and ox.ensembl_object_type='" + xrefObj + "' and ox.xref_id=x.xref_id " + edbClause + " GROUP BY t.seq_region_id"; //System.out.println(xrefQuery); //Restrict this to the core_coord_system_ids for the specific DB //other wise we may get odd counts where core_coord_system_ids have changed between releases on the same assembly ResultSet xrefCounts = efgCon .createStatement() .executeQuery(xrefQuery); //Capture empty set here boolean seenResults = false; while (xrefCounts.next()){ seenResults = true; coreSrIDcounts.put(xrefCounts.getString(1), xrefCounts.getString(2)); } rs.close(); if(seenResults == false){ result = false; //Change this to info and remove false? //This maybe that the new external_db exists, but the xrefs were loaded using an old version ReportManager.problem(this, efgCon, "Found " + arrayVendor + " " + xrefObj + " arrays but no associated transcript xrefs for external_db like " + assemblyBuild[1] + "\nXrefs maybe present on a previous version"); //redo this without assembly build clause, or just select and group by it in the first query and test in loop } else{ // check every chr has >0 xrefs. for (Iterator<String> iter = coreSrIDcounts.keySet().iterator(); iter.hasNext();) { String coreSrID = (String) iter.next(); String efgSrID = (String) coreSrID2efg.get(coreSrID); String name = (String) srID2name.get(efgSrID); //System.out.println("core " + coreSrID + " efg " + efgSrID + " name " + name); //Skip nulls as these won't be chromosomes //But may have xrefs if ( name != null ){ String label = name + " (seq_region_id=" + efgSrID + ")"; long count = coreSrIDcounts.containsKey(coreSrID) ? Long.parseLong(coreSrIDcounts.get(coreSrID).toString()) : 0; //System.out.println(label + " " + count); if (count > 0) { ReportManager.correct(this, efgCon, "Chromosome " + label + " has " + coreSrIDcounts.get(coreSrID) + " associated " + xrefObj + " array xrefs."); } else { ReportManager.problem(this, efgCon, "Chromosome " + label + " has no associated " + xrefObj + " array xrefs."); result = false; } } } } } } } catch (SQLException se) { se.printStackTrace(); result = false; } return result; } // run protected String getCoreDbName(DatabaseRegistryEntry dbre, String schemaBuild) { String coreDBName = dbre.getSpecies() + "_core_" + schemaBuild; return coreDBName; } } // ArrayXrefs