/* * Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute * Copyright [2016-2017] EMBL-European Bioinformatics Institute * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ensembl.healthcheck.testcase.compara; import java.sql.Connection; import java.sql.ResultSet; import java.sql.Statement; import java.util.ArrayList; import java.util.Map; import java.util.Vector; import org.ensembl.healthcheck.DatabaseRegistry; import org.ensembl.healthcheck.DatabaseRegistryEntry; import org.ensembl.healthcheck.DatabaseType; import org.ensembl.healthcheck.ReportManager; import org.ensembl.healthcheck.Species; import org.ensembl.healthcheck.Team; import org.ensembl.healthcheck.testcase.compara.AbstractComparaTestCase; import org.ensembl.healthcheck.util.DBUtils; /** * Check compara taxon table against core meta ones. */ public class CheckTaxon extends AbstractComparaTestCase { /** * Create a new instance of MetaCrossSpecies */ public CheckTaxon() { setDescription("Check that the attributes of the taxon table (genus, species," + " common_name and classification) correspond to the meta data in the core DB and vice versa."); setTeamResponsible(Team.COMPARA); } /** * Check that the attributes of the taxon table (genus, species, common_name and * classification) correspond to the meta data in the core DB and vice versa. * NB: A warning message is displayed if some dnafrags cannot be checked because * there is not any connection to the corresponding core database. * * @param comparaDbre * The database registry containing all the specified databases. * @return true if the all the taxa in compara.taxon table which have a counterpart in * the compara.genome_db table match the corresponding core databases. */ public boolean run(DatabaseRegistryEntry comparaDbre) { boolean result = true; result &= checkTaxon(comparaDbre); return result; } /** * Check that the attributes of the taxon table (genus, species, common_name and * classification) correspond to the meta data in the core DB and vice versa. * NB: A warning message is displayed if some dnafrags cannot be checked because * there is not any connection to the corresponding core database. * * @param comparaDbre * The database registry entry for Compara DB * @return true if the all the taxa in compara.taxon table which have a counterpart in * the compara.genome_db table match the corresponding core databases. */ public boolean checkTaxon(DatabaseRegistryEntry comparaDbre) { boolean result = true; Connection comparaCon = comparaDbre.getConnection(); // Get list of species in compara Vector<Species> comparaSpecies = new Vector<Species>(); String sql = "SELECT DISTINCT genome_db.name FROM genome_db WHERE first_release IS NOT NULL AND last_release IS NULL" + " AND name <> 'ancestral_sequences'"; try { Statement stmt = comparaCon.createStatement(); ResultSet rs = stmt.executeQuery(sql); while (rs.next()) { comparaSpecies.add(Species.resolveAlias(rs.getString(1).toLowerCase().replace(' ', '_'))); } rs.close(); stmt.close(); } catch (Exception e) { e.printStackTrace(); } //Check that don't have duplicate entries in the ncbi_taxa_name table String useful_sql = "SELECT taxon_id,name,name_class,count(*) FROM ncbi_taxa_name GROUP BY taxon_id,name,name_class HAVING count(*) > 1;"; String[] failures = DBUtils.getColumnValues(comparaCon, useful_sql); if (failures.length > 0) { ReportManager.problem(this, comparaCon, "FAILED ncbi_taxa_name contains duplicate entries "); ReportManager.problem(this, comparaCon, "FAILURE DETAILS: There are " + failures.length + " ncbi_taxa_names with more than 1 entry"); ReportManager.problem(this, comparaCon, "USEFUL SQL: " + useful_sql); result = false; } else { result = true; } Map<Species, DatabaseRegistryEntry> speciesMap = getSpeciesCoreDbMap(DBUtils.getMainDatabaseRegistry()); boolean allSpeciesFound = true; for (Species species: comparaSpecies) { if (speciesMap.containsKey(species)) { Connection speciesCon = speciesMap.get(species).getConnection(); String sql1, sql2; /* Get taxon_id */ String taxon_id = DBUtils.getRowColumnValue(speciesCon, "SELECT meta_value FROM meta WHERE meta_key = \"species.taxonomy_id\""); /* Check name ++ compara scientific name := last two entries in the species classification in the core meta table */ sql1 = "SELECT \"name\", name " + " FROM ncbi_taxa_name WHERE name_class = \"scientific name\" AND taxon_id = " + taxon_id; sql2 = "SELECT \"name\", meta_value " + " FROM meta WHERE meta_key = \"species.classification\" ORDER BY meta_id LIMIT 1"; result &= compareQueries(comparaCon, sql1, speciesCon, sql2); /* Check scientific name in compara and core meta */ sql1 = "SELECT \"scientific name\", name " + " FROM ncbi_taxa_name WHERE name_class = \"scientific name\" AND taxon_id = " + taxon_id; sql2 = "SELECT \"scientific name\", meta_value " + " FROM meta WHERE meta_key = \"species.scientific_name\""; result &= compareQueries(comparaCon, sql1, speciesCon, sql2); /* Check classification */ /* This check is quite complex as the axonomy is stored in very different ways in compara and core DBs. In compara, the tree structure is stored in the ncbi_taxa_node table while the names are in the ncbi_taxa_name table. In the core DB, the taxonomy is stored in the meta table as values of the key "species.classification" and they should be sorted by meta_id. In the core DB, only the abbreviated lineage is described which means that we have to ignore ncbi_taxa_node with the genbank_hidden_flag set. On top of that, we want to compare the classification in one single SQL. Therefore, we are getting the results recursivelly and then execute a dumb SQL query with result itself */ String comparaClassification = ""; String values1[] = DBUtils.getRowValues(comparaCon, "SELECT rank, parent_id, genbank_hidden_flag FROM ncbi_taxa_node WHERE taxon_id = " + taxon_id); if (values1.length == 0) { /* if no rows are fetched, this taxon is missing from compara DB */ ReportManager.problem(this, comparaCon, "No taxon for " + species.toString()); } else { String this_taxon_id = values1[1]; while (!this_taxon_id.equals("0")) { values1 = DBUtils.getRowValues(comparaCon, "SELECT rank, parent_id, genbank_hidden_flag FROM ncbi_taxa_node WHERE taxon_id = " + this_taxon_id); if ( // values1[2].equals("0") && // we used to filter out entries with genbank_hidden_flag, we don't anymore !values1[1].equals("1") && !values1[1].equals("0") && !values1[0].equals("subgenus") && !values1[0].equals("genus") && !values1[0].equals("species subgroup") && !values1[0].equals("species group") ) { String taxonName = DBUtils.getRowColumnValue(comparaCon, "SELECT name FROM ncbi_taxa_name " + "WHERE name_class = \"scientific name\" AND taxon_id = " + this_taxon_id); comparaClassification += " " + taxonName; } this_taxon_id = values1[1]; } sql1 = "SELECT \"classification\", \"" + comparaClassification + "\""; /* It will be much better to run this using GROUP_CONCAT() but our MySQL server does not support it yet */ sql2 = "SELECT \"classification\", \""; String[] values2 = DBUtils.getColumnValues(speciesCon, "SELECT meta_value FROM meta WHERE meta_key = \"species.classification\"" + " ORDER BY meta_id"); /* Skip first value as it is part of the species name and not the lineage */ for (int a = 1; a < values2.length; a++) { sql2 += " " + values2[a]; } sql2 += "\""; result &= compareQueries(comparaCon, sql1, speciesCon, sql2); } } else { ReportManager.problem(this, comparaCon, "No connection for " + species.toString()); allSpeciesFound = false; } } return result; } } // CheckTopLevelDnaFrag