/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ensembl.healthcheck.testcase.compara;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Pattern;
import org.ensembl.healthcheck.DatabaseRegistry;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.DatabaseType;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Species;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.testcase.compara.AbstractComparaTestCase;
import org.ensembl.healthcheck.util.DBUtils;
/**
 * Check the compara genome_db table against the meta data of the core databases.
 */
public class CheckGenomeDB extends AbstractComparaTestCase {

    /** SQL predicate used throughout this class to select the current genome_db rows. */
    private static final String CURRENT_GENOME_DB = "first_release IS NOT NULL AND last_release IS NULL";

    /**
     * Create a new instance of CheckGenomeDB.
     */
    public CheckGenomeDB() {
        setDescription("Check that the properties of the genome_db table (taxon_id, assembly" +
            " and genebuild) correspond to the meta data in the core DB and vice versa.");
        setTeamResponsible(Team.COMPARA);
    }

    /**
     * Run every check of this test case: the assembly checks, the comparison of
     * genome_db with the core databases, and a count check on genome_db rows
     * matching "locator IS NOT NULL".
     * All three checks are always executed, even when an earlier one fails.
     *
     * @param comparaDbre
     *          The compara database to check.
     * @return true only if all the checks passed.
     */
    public boolean run(final DatabaseRegistryEntry comparaDbre) {
        boolean result = true;
        // Non-short-circuit "&=" so that every check runs and reports its own problems
        result &= checkAssemblies(comparaDbre);
        result &= checkGenomeDB(comparaDbre);
        result &= checkCountIsZero(comparaDbre.getConnection(), "genome_db", "locator IS NOT NULL");
        return result;
    }

    /**
     * Check the consistency of the assemblies listed in genome_db.
     * A species with more than one current assembly is always a problem. When the
     * database is not the master database, any non-current assembly is a problem too.
     * In the master database, species without any current assembly are only reported
     * as information and do not affect the returned value.
     *
     * @param comparaDbre
     *          The compara database to check.
     * @return false if at least one problem has been reported, true otherwise.
     */
    public boolean checkAssemblies(DatabaseRegistryEntry comparaDbre) {
        boolean result = true;
        Connection comparaCon = comparaDbre.getConnection();
        String comparaDbName = (comparaCon == null) ? "no_database" : DBUtils.getShortDatabaseName(comparaCon);

        // Get list of species with more than 1 default assembly
        String sql = "SELECT DISTINCT genome_db.name FROM genome_db WHERE " + CURRENT_GENOME_DB
            + " GROUP BY name HAVING count(*) <> 1";
        List<String[]> data = DBUtils.getRowValuesList(comparaCon, sql);
        for (String[] line : data) {
            ReportManager.problem(this, comparaCon, "There are more than 1 current assembly for " + line[0]);
            result = false;
        }

        // Evaluated once and reused, instead of calling isMasterDB() a second time
        boolean masterDb = isMasterDB(comparaCon);
        if (!masterDb) {
            // Get list of species with a non-default assembly
            sql = "SELECT DISTINCT name FROM genome_db WHERE first_release IS NULL OR last_release IS NOT NULL";
            data = DBUtils.getRowValuesList(comparaCon, sql);
            for (String[] line : data) {
                ReportManager.problem(this, comparaCon, comparaDbName + " There is at least one non-current assembly for " + line[0] + " (this should not happen in the release DB)");
                result = false;
            }
        } else {
            // Get list of species with no default assembly
            sql = "SELECT DISTINCT name FROM genome_db GROUP BY name HAVING SUM(" + CURRENT_GENOME_DB + ") = 0";
            data = DBUtils.getRowValuesList(comparaCon, sql);
            for (String[] line : data) {
                ReportManager.info(this, comparaCon, "There is no default assembly for " + line[0]);
            }
        }
        return result;
    }

    /**
     * Compare the name, taxon_id, assembly and genebuild of every current genome_db
     * entry (except 'ancestral_sequences') with the values found in the matching
     * core database.
     * Names that do not resolve to a known Species, and species without an available
     * core database, are reported as problems but do not change the returned value.
     *
     * @param comparaDbre
     *          The compara database to check.
     * @return false if at least one comparison failed, true otherwise.
     */
    public boolean checkGenomeDB(DatabaseRegistryEntry comparaDbre) {
        boolean result = true;
        Connection comparaCon = comparaDbre.getConnection();

        // Get list of species in compara
        List<Species> comparaSpecies = new ArrayList<Species>();
        String sql = "SELECT DISTINCT genome_db.name FROM genome_db WHERE " + CURRENT_GENOME_DB
            + " AND name <> 'ancestral_sequences'";
        List<String[]> data = DBUtils.getRowValuesList(comparaCon, sql);
        for (String[] line : data) {
            Species species = Species.resolveAlias(line[0].toLowerCase().replace(' ', '_'));
            if (species.toString().equals("unknown")) {
                ReportManager.problem(this, comparaCon, "No species defined for " + line[0] + " in org.ensembl.healthcheck.Species");
            } else {
                comparaSpecies.add(species);
            }
        }

        Map<Species, DatabaseRegistryEntry> speciesMap = getSpeciesCoreDbMap(DBUtils.getMainDatabaseRegistry());
        for (Species species : comparaSpecies) {
            if (!speciesMap.containsKey(species)) {
                ReportManager.problem(this, comparaCon, "No connection for " + species);
                continue;
            }
            Connection speciesCon = speciesMap.get(species).getConnection();
            /* Check production name */
            result &= compareGenomeDbProperty(comparaCon, speciesCon, species, "name",
                "meta_value FROM meta WHERE meta_key = \"species.production_name\"");
            /* Check taxon_id */
            result &= compareGenomeDbProperty(comparaCon, speciesCon, species, "taxon_id",
                "meta_value FROM meta WHERE meta_key = \"species.taxonomy_id\"");
            /* Check assembly */
            result &= compareGenomeDbProperty(comparaCon, speciesCon, species, "assembly",
                "version FROM coord_system WHERE rank=1");
            /* Check genebuild */
            result &= compareGenomeDbProperty(comparaCon, speciesCon, species, "genebuild",
                "meta_value FROM meta WHERE meta_key = \"genebuild.start_date\"");
        }
        return result;
    }

    /**
     * Compare one column of the current genome_db row of a species with a value
     * selected from its core database. Both queries start with the species and the
     * property name as constant columns, followed by the value itself.
     *
     * @param comparaCon
     *          Connection to the compara database.
     * @param speciesCon
     *          Connection to the core database of the species.
     * @param species
     *          The species being checked.
     * @param property
     *          The genome_db column to compare; also used as the label column.
     * @param coreSelection
     *          The end of the core query: the value expression followed by its
     *          FROM and WHERE clauses.
     * @return the value returned by compareQueries for the two queries.
     */
    private boolean compareGenomeDbProperty(Connection comparaCon, Connection speciesCon,
            Species species, String property, String coreSelection) {
        String selectPrefix = "SELECT \"" + species + "\", \"" + property + "\", ";
        String comparaSql = selectPrefix + property + " FROM genome_db"
            + " WHERE genome_db.name = \"" + species + "\" AND " + CURRENT_GENOME_DB;
        String coreSql = selectPrefix + coreSelection;
        return compareQueries(comparaCon, comparaSql, speciesCon, coreSql);
    }

} // CheckGenomeDB