/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ensembl.healthcheck.testcase.compara;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import org.ensembl.healthcheck.DatabaseRegistry;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.DatabaseType;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Species;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.testcase.compara.AbstractComparaTestCase;
import org.ensembl.healthcheck.util.DBUtils;
/**
 * Check the compara dnafrag table against the top-level seq_regions of the
 * corresponding core databases.
 */
public class CheckTopLevelDnaFrag extends AbstractComparaTestCase {

    /** Largest row set compared with a single pair of queries; bigger sets are paged. */
    private static final int MAX_ROWS = 50000;

    /** Current (released, not retired) genome_dbs, excluding the ancestral sequences. */
    private static final String CURRENT_SPECIES_SQL =
        "SELECT DISTINCT genome_db.name FROM genome_db WHERE first_release IS NOT NULL AND last_release IS NULL"
        + " AND name <> 'ancestral_sequences'";

    /** Columns selected on the compara side; must line up with {@link #CORE_COLUMNS}. */
    private static final String DNAFRAG_COLUMNS =
        "SELECT dnafrag.coord_system_name, dnafrag.name, CONCAT('length=', dnafrag.length), CONCAT('is_ref=', dnafrag.is_reference)";

    /** Columns selected on the core side; must line up with {@link #DNAFRAG_COLUMNS}. */
    private static final String CORE_COLUMNS =
        "SELECT coord_system.name, seq_region.name, CONCAT('length=', seq_region.length),"
        + " CONCAT('is_ref=', IF(non_ref_seq_region.seq_region_id is not null, 0, 1))";

    /** FROM/WHERE part selecting the top-level seq_regions of a core database. */
    private static final String CORE_SOURCE =
        " FROM seq_region"
        + " JOIN coord_system USING (coord_system_id)"
        + " JOIN seq_region_attrib USING (seq_region_id)"
        + " JOIN attrib_type USING (attrib_type_id)"
        + " LEFT JOIN (SELECT seq_region_id FROM seq_region_attrib JOIN attrib_type USING (attrib_type_id) WHERE attrib_type.code = 'non_ref') non_ref_seq_region USING (seq_region_id)"
        + " WHERE attrib_type.code = 'toplevel'";

    /**
     * Create a new instance of CheckTopLevelDnaFrag.
     */
    public CheckTopLevelDnaFrag() {
        setDescription("Check that every dnafrag corresponds to a top_level seq_region in the core DB and vice versa.");
        setTeamResponsible(Team.COMPARA);
    }

    /**
     * Check that every dnafrag corresponds to a top_level seq_region in the core DB
     * and vice versa.
     * NB: A warning message is displayed if some dnafrags cannot be checked because
     * there is not any connection to the corresponding core database.
     *
     * @param comparaDbre
     *          The registry entry of the compara database to check.
     * @return true if all the dnafrags are top_level seq_regions in their corresponding
     *         core database.
     */
    public boolean run(DatabaseRegistryEntry comparaDbre) {
        return checkTopLevelDnaFrag(comparaDbre);
    }

    /**
     * Compare, species by species, the dnafrags of the compara database with the
     * top-level seq_regions of the matching core database.
     *
     * Species without an entry in the map returned by {@code getSpeciesCoreDbMap}
     * are reported in a single "No connection for ..." problem; they do not change
     * the returned value.
     *
     * @param comparaDbre
     *          The registry entry of the compara database to check.
     * @return false if the species list could not be read or if any comparison
     *         failed, true otherwise.
     */
    public boolean checkTopLevelDnaFrag(DatabaseRegistryEntry comparaDbre) {
        Connection comparaCon = comparaDbre.getConnection();

        List<Species> comparaSpecies;
        try {
            comparaSpecies = fetchCurrentSpecies(comparaCon);
        } catch (SQLException e) {
            // Without the species list nothing can be verified: fail loudly
            // instead of passing vacuously on an empty list.
            e.printStackTrace();
            ReportManager.problem(this, comparaCon,
                "Could not read the list of current species from genome_db: " + e.getMessage());
            return false;
        }

        Map<Species, DatabaseRegistryEntry> speciesMap = getSpeciesCoreDbMap(DBUtils.getMainDatabaseRegistry());

        boolean result = true;
        StringBuilder speciesNotFound = new StringBuilder();
        for (Species species : comparaSpecies) {
            DatabaseRegistryEntry coreDbre = speciesMap.get(species);
            if (coreDbre == null) {
                // Collected here and reported once after the loop
                if (speciesNotFound.length() > 0) {
                    speciesNotFound.append(", ");
                }
                speciesNotFound.append(species);
                continue;
            }
            result &= compareTopLevelRegions(comparaCon, coreDbre.getConnection(), species);
        }

        // Warning about missing species
        if (speciesNotFound.length() > 0) {
            ReportManager.problem(this, comparaCon, "No connection for " + speciesNotFound);
        }
        return result;
    }

    /**
     * Read the names of the current genome_dbs and resolve them to species.
     * The statement and result set are always closed, even on failure.
     */
    private List<Species> fetchCurrentSpecies(Connection comparaCon) throws SQLException {
        List<Species> found = new ArrayList<Species>();
        Statement stmt = comparaCon.createStatement();
        try {
            ResultSet rs = stmt.executeQuery(CURRENT_SPECIES_SQL);
            try {
                while (rs.next()) {
                    found.add(Species.resolveAlias(rs.getString(1).toLowerCase().replace(' ', '_')));
                }
            } finally {
                rs.close();
            }
        } finally {
            stmt.close();
        }
        return found;
    }

    /**
     * Compare the dnafrags of one species with the top-level seq_regions of its
     * core database, paging through the rows when either side is large.
     */
    private boolean compareTopLevelRegions(Connection comparaCon, Connection speciesCon, Species species) {
        String dnafragSource = dnafragSource(species);
        String comparaSql = DNAFRAG_COLUMNS + dnafragSource;
        String coreSql = CORE_COLUMNS + CORE_SOURCE;

        // Size the comparison on the larger side, so that core rows lying beyond
        // the last compara page are still compared.
        int comparaRows = DBUtils.getRowCount(comparaCon, "SELECT COUNT(*)" + dnafragSource);
        int coreRows = DBUtils.getRowCount(speciesCon, "SELECT COUNT(*)" + CORE_SOURCE);
        int rows = Math.max(comparaRows, coreRows);

        if (rows <= MAX_ROWS) {
            return compareQueries(comparaCon, comparaSql, speciesCon, coreSql);
        }

        // Divide and conquer approach for large sets: both sides are sorted by
        // name and compared page by page.
        boolean result = true;
        for (int offset = 0; offset < rows; offset += MAX_ROWS) {
            String page = " LIMIT " + offset + ", " + MAX_ROWS;
            result &= compareQueries(
                comparaCon, comparaSql + " ORDER BY (dnafrag.name)" + page,
                speciesCon, coreSql + " ORDER BY (seq_region.name)" + page);
        }
        return result;
    }

    /**
     * Build the FROM/WHERE part selecting the dnafrags of the current genome_db
     * of the given species.
     */
    private static String dnafragSource(Species species) {
        return " FROM dnafrag LEFT JOIN genome_db USING (genome_db_id)"
            + " WHERE genome_db.name = \"" + species + "\" AND first_release IS NOT NULL AND last_release IS NULL";
    }

} // CheckTopLevelDnaFrag