/* * Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute * Copyright [2016-2017] EMBL-European Bioinformatics Institute * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ensembl.healthcheck.testcase.eg_core; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.util.List; import org.ensembl.healthcheck.DatabaseRegistryEntry; import org.ensembl.healthcheck.ReportManager; import org.ensembl.healthcheck.Team; import org.ensembl.healthcheck.testcase.EnsTestCase; import org.ensembl.healthcheck.testcase.eg_compara.AbstractControlledRows; import org.ensembl.healthcheck.util.SqlTemplate; import org.ensembl.healthcheck.util.SqlUncheckedException; import org.ensembl.healthcheck.util.SqlTemplate.ResultSetCallback; /** * * Checks whether all toplevel sequences in a core database are present as * dnafrag regions in the compara master database, if the genome is in * the compara master database. * * @author mnuhn * */ public class SeqRegionsConsistentWithComparaMaster extends AbstractControlledRows { final int reportMaxMissingRows = 20; protected Connection testDbConn; protected SqlTemplate sqlTemplateTestDb; protected void init(DatabaseRegistryEntry dbre) { super.init(); testDbConn = dbre.getConnection(); sqlTemplateTestDb = getSqlTemplate(testDbConn); setTeamResponsible(Team.ENSEMBL_GENOMES); } @Override protected boolean runTest(DatabaseRegistryEntry dbre) { init(dbre); List<Integer> allSpeciesIds = sqlTemplateTestDb.queryForDefaultObjectList( "select distinct species_id from meta where species_id is not null", Integer.class ); if (allSpeciesIds.size() == 0) { ReportManager.problem(this, testDbConn, "No species configured!"); } boolean allSpeciesPassed = true; for(int speciesId : allSpeciesIds) { allSpeciesPassed &= runTestForSpecies(dbre, speciesId); } return allSpeciesPassed; } protected boolean runTestForSpecies(DatabaseRegistryEntry dbre, int speciesId) { String productionName = fetchSingleMetaValueFor(sqlTemplateTestDb, speciesId, "species.production_name"); String assemblyDefault = fetchSingleMetaValueFor(sqlTemplateTestDb, speciesId, "assembly.default"); String genebuildStartDate = fetchSingleMetaValueFor(sqlTemplateTestDb, speciesId, "genebuild.start_date"); if (!speciesConfiguredForDnaCompara(productionName)) { getLogger().info("Skipping species " + productionName + ", because it is not linked to any method involving DNA comparisons in the compara master."); return true; } getLogger().info("Testing species " + productionName); boolean hasEntryInMasterDb = fetchHasGenomeDbId( productionName, assemblyDefault, genebuildStartDate ); if (!hasEntryInMasterDb) { ReportManager.correct(this, testDbConn, "Species " + productionName + " has no genome_db entry in the master database."); return true; } int genomeDbId = fetchGenomeDbId( productionName, assemblyDefault, genebuildStartDate ); int toplevelSeqRegionCount = fetchToplevelSeqRegionCount(); int dnaFragRowCountFor = fetchDnaFragRowCountFor(genomeDbId); boolean sequenceCountsOk = toplevelSeqRegionCount == dnaFragRowCountFor; if (sequenceCountsOk) { ReportManager.correct(this, testDbConn, "Sequence counts for this " + "species are " + toplevelSeqRegionCount + " both in " + "core and compara master database."); } else { ReportManager.problem(this, testDbConn, "Sequence counts for this " + "species are " + toplevelSeqRegionCount + " toplevel " + "sequence regions in the core database and " + dnaFragRowCountFor + " dna frags in the compara " + "master database. The counts should be equal.\n" + "This can happen, if the assembly has been changed, " + "but the assembly.default entry in the meta table has " + "not been changed. In that case the dna_frag table in " + "the compara master database is not updated " + "by the populate_mdb.pl script." ); } boolean allToplevelSeqRegionInDnaFragTable = assertToplevelSeqRegionInDnaFragTable(genomeDbId); return sequenceCountsOk && allToplevelSeqRegionInDnaFragTable; } protected int fetchToplevelSeqRegionCount() { List<Integer> numSeqRegionsList = sqlTemplateTestDb.queryForDefaultObjectList( "select count(*) from seq_region join seq_region_attrib using (seq_region_id) join attrib_type using (attrib_type_id) where code='toplevel'", Integer.class); assertLengthIsOne(numSeqRegionsList); Integer numSeqRegions = numSeqRegionsList.get(0); return numSeqRegions; } protected class SeqRegionData { public int seq_region_id; public String seq_region_name; public int seq_region_length; public String coord_system_name; public String toString() { return "seq_region_id = " + seq_region_id + "\n" + "seq_region.name = " + seq_region_name + "\n" + "seq_region.length = " + seq_region_length + "\n" + "coord_system.name = " + coord_system_name ; } } protected boolean assertToplevelSeqRegionInDnaFragTable(final int genomeDbId) { final EnsTestCase thisTest = this; Boolean allRowsExistInDnaFragTable = sqlTemplateTestDb.execute( "select" + " seq_region.seq_region_id, " + " seq_region.name, " + " seq_region.length, " + " coord_system.name " + "from " + " seq_region join seq_region_attrib using (seq_region_id) " + " join attrib_type using (attrib_type_id) " + " join coord_system using (coord_system_id) " + "where " + "code='toplevel' ", new ResultSetCallback<Boolean>() { @Override public Boolean process(ResultSet rs) throws SQLException { SeqRegionData seqRegionData = new SeqRegionData(); int missingRows = 0; boolean allRowsExistInDnaFragTable = true; while (rs.next()) { seqRegionData.seq_region_id = rs.getInt(1); seqRegionData.seq_region_name = rs.getString(2); seqRegionData.seq_region_length = rs.getInt(3); seqRegionData.coord_system_name = rs.getString(4); int numCorrespondingRowsInDnaFragTable = fetchNumCorrespondingRowsInDnaFragTable( seqRegionData, genomeDbId ); boolean currentRowExistsInDnaFragTable = false; if (numCorrespondingRowsInDnaFragTable == 1) { currentRowExistsInDnaFragTable = true; } if (numCorrespondingRowsInDnaFragTable == 0) { ReportManager.problem(thisTest, testDbConn, "The following seq region is not in the dnafrag table in the master database:\n" + seqRegionData); ReportManager.problem(thisTest, testDbConn, "The seq region that comes up with this sql in the core database:\n\n" + createUsefulSqlCore(seqRegionData) + "\n\n" + "should come up with this sql:\n\n" + createUsefulSqlMaster(seqRegionData, genomeDbId) ); missingRows++; if (missingRows>=reportMaxMissingRows) { ReportManager.problem(thisTest, testDbConn, "No more rows will be reported, because the maximum of " + reportMaxMissingRows + " has been reached."); return false; } currentRowExistsInDnaFragTable = false; } if (numCorrespondingRowsInDnaFragTable > 1) { throw new RuntimeException("Unexpected value for numCorrespondingRowsInDnaFragTable:" + numCorrespondingRowsInDnaFragTable); } allRowsExistInDnaFragTable &= currentRowExistsInDnaFragTable; } return allRowsExistInDnaFragTable; } }, // No bound parameters // new Object[0] ); return allRowsExistInDnaFragTable; } protected String createUsefulSqlMaster(final SeqRegionData seqRegionData, final int genomeDbId) { return "select * " + "from dnafrag " + "where genome_db_id = "+genomeDbId+" " + "and name = '" + seqRegionData.seq_region_name + "' " + "and length = " + seqRegionData.seq_region_length + " " + "and coord_system_name='" + seqRegionData.coord_system_name + "'"; } protected String createUsefulSqlCore(final SeqRegionData seqRegionData) { return "select \n" + " seq_region.seq_region_id, \n" + " seq_region.name, \n" + " seq_region.length, \n" + " coord_system.name \n" + "from \n" + " seq_region join seq_region_attrib using (seq_region_id) \n" + " join attrib_type using (attrib_type_id) \n" + " join coord_system using (coord_system_id) \n" + "where \n" + " code='toplevel' \n" + " and seq_region_id="+seqRegionData.seq_region_id+"; \n" ; } protected int fetchNumCorrespondingRowsInDnaFragTable( final SeqRegionData seqRegionData, final int genomeDbId ) { List<Integer> numSeqRegionsInDnaFragTableList = masterSqlTemplate.queryForDefaultObjectList( "select " + " count(*) " + "from " + " dnafrag " + "where " + " genome_db_id=?" + " and name=?" + " and length=?" + " and coord_system_name=?", Integer.class, genomeDbId, seqRegionData.seq_region_name, seqRegionData.seq_region_length, seqRegionData.coord_system_name ); assertLengthIsOne(numSeqRegionsInDnaFragTableList); return numSeqRegionsInDnaFragTableList.get(0); } protected void assertLengthIsOne(List<?> list) { if (list.size()>1) { throw new RuntimeException("Got more than one return value. Expected only one!"); } if (list.size()==0) { throw new RuntimeException("Got no return value!"); } } protected int fetchDnaFragRowCountFor(int genomeDbId) { List<Integer> metaValueList = masterSqlTemplate.queryForDefaultObjectList( "select count(*) from dnafrag where genome_db_id=" + genomeDbId, Integer.class); assertLengthIsOne(metaValueList); return metaValueList.get(0); } /** * @param productionName * @param assemblyDefault * @param genebuildStartDate */ protected int fetchGenomeDbId( String productionName, String assemblyDefault, String genebuildStartDate) { ResultSet rs = fetchFromGenomeDbId(productionName, assemblyDefault, genebuildStartDate, "genome_db_id"); int genomeDbId; try { boolean hasResult = rs.next(); if (!hasResult) { throw new RuntimeException("Can't fetch Species " + productionName + " from genome_db table!"); } genomeDbId = rs.getInt(1); } catch (SQLException e) { throw new RuntimeException(e); } return genomeDbId; } /** * Check in master database, if there is a genome_db entry for this * species. */ protected boolean fetchHasGenomeDbId( String productionName, String assemblyDefault, String genebuildStartDate) { ResultSet rs = fetchFromGenomeDbId(productionName, assemblyDefault, genebuildStartDate, "count(genome_db_id)"); int genomeDbId; try { boolean hasResult = rs.next(); if (!hasResult) { throw new RuntimeException("Can't count rows on genome_db table!"); } genomeDbId = rs.getInt(1); } catch (SQLException e) { throw new RuntimeException(e); } if (genomeDbId==1) { return true; } if (genomeDbId==0) { return false; } throw new RuntimeException("Unexpected number of matching rows for " + productionName + " in master database!"); } protected ResultSet fetchFromGenomeDbId( String productionName, String assemblyDefault, String genebuildStartDate, String column ) { String sql = "select "+column+" from genome_db where name=? and assembly=? and genebuild=? and genome_component IS NULL"; ResultSet rs = null; try { PreparedStatement stmt = this.masterDbConn.prepareStatement(sql); stmt.setString(1, productionName); stmt.setString(2, assemblyDefault); stmt.setString(3, genebuildStartDate); rs = stmt.executeQuery(); } catch (SQLException e) { throw new SqlUncheckedException(e.getMessage()); } return rs; } /** * @param sqlTemplateTestDb * @param metaKey * @return metaValue */ protected String fetchSingleMetaValueFor( final SqlTemplate sqlTemplateTestDb, int speciesId, String metaKey ) { String sql = "select meta_value from meta where meta.meta_key = '"+metaKey+"' and species_id="+speciesId; List<String> metaValueList = sqlTemplateTestDb.queryForDefaultObjectList( sql, String.class ); if (metaValueList.size()>1) { throw new RuntimeException("Got more than one meta_value for metaKey "+metaKey+". Expected only one!\n"+sql); } if (metaValueList.size()==0) { throw new RuntimeException("Metakey "+metaKey+" is missing in the meta table!\n"+sql); } String metaValue = metaValueList.get(0); return metaValue; } }