/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ensembl.healthcheck.testcase.compara;
import java.sql.Connection;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase;
import org.ensembl.healthcheck.util.DBUtils;
/**
 * An EnsEMBL Healthcheck test case that checks the conservation_score table.
 *
 * For every conservation-score method_link_species_set it looks up the
 * associated multiple alignment (via the msa_mlss_id tag) and reports the
 * genomic_align_blocks of that alignment which have more than 3 sequences
 * but no row in conservation_score.
 */
public class CheckConservationScoreSanity extends SingleDatabaseTestCase {

    /**
     * Create a CheckConservationScoreSanity test case, setting its
     * description and the team responsible for it.
     */
    public CheckConservationScoreSanity() {
        setDescription("Check the conservation_score table in ensembl_compara databases.");
        setTeamResponsible(Team.COMPARA);
    }

    /**
     * Run the test.
     *
     * @param dbre
     *            The database to use.
     * @return true if the test passed, i.e. no alignment has unscored blocks
     *         other than blocks of exactly 4 sequences.
     */
    public boolean run(DatabaseRegistryEntry dbre) {

        boolean result = true;

        Connection con = dbre.getConnection();

        // Get all method_link_species_set_ids for method_link type of
        // GERP_CONSERVATION_SCORE (or any ConservationScore class).
        String[] method_link_species_set_ids = DBUtils.getColumnValues(con, "SELECT method_link_species_set_id FROM method_link_species_set LEFT JOIN method_link USING (method_link_id) WHERE type=\"GERP_CONSERVATION_SCORE\" OR class LIKE \"ConservationScore%\"");

        // Get the genome_db_id of the ancestral sequences; treated as
        // "not present in this database" when the value is empty.
        String ancestral_seq_id = DBUtils.getRowColumnValue(con, "SELECT genome_db_id FROM genome_db WHERE name = \"ancestral_sequences\"");

        // An empty array simply means there is nothing to check.
        for (String mlss_id : method_link_species_set_ids) {

            // Get the mlss_id for the associated multiple alignment
            String multi_align_mlss_id = DBUtils.getRowColumnValue(con, "SELECT value FROM method_link_species_set_tag WHERE tag=\"msa_mlss_id\" AND method_link_species_set_id=" + mlss_id);

            if (isMissing(multi_align_mlss_id)) {
                // NOTE(review): a missing tag is reported but does not make
                // the test fail; this mirrors the existing behaviour -
                // confirm whether it should set the result to false.
                ReportManager.problem(this, con, "There is no msa_mlss_id tag for the GERP mlss" + mlss_id + "\n");
                continue;
            }

            if (!checkMultipleAlignment(con, multi_align_mlss_id, ancestral_seq_id)) {
                result = false;
            }
        }

        return result;
    }

    /**
     * Tell whether a value fetched from the database is absent.
     *
     * Content is compared rather than references: {@code value == ""} only
     * holds when both sides happen to be the same interned String object.
     *
     * @param value
     *            the fetched value, possibly null.
     * @return true if the value is null or the empty string.
     */
    private static boolean isMissing(String value) {
        return value == null || value.isEmpty();
    }

    /**
     * Check one multiple alignment for blocks with more than 3 sequences
     * that have no conservation scores.
     *
     * @param con
     *            connection to the compara database.
     * @param multi_align_mlss_id
     *            method_link_species_set_id of the multiple alignment.
     * @param ancestral_seq_id
     *            genome_db_id of the ancestral sequences, or empty/null if
     *            the database has none.
     * @return false if unscored blocks were found that are not all blocks of
     *         exactly 4 sequences; true otherwise (including the
     *         warning-only case).
     */
    private boolean checkMultipleAlignment(Connection con, String multi_align_mlss_id, String ancestral_seq_id) {

        // Find the multiple alignments gabs which have more than 3 species
        // but don't have any conservation scores. Gabs containing ancestral
        // sequences need to be excluded, hence the extra dnafrag join and
        // genome_db_id filter when an ancestral genome_db exists.
        String useful_sql;
        if (isMissing(ancestral_seq_id)) {
            useful_sql = "SELECT genomic_align_block.genomic_align_block_id FROM genomic_align_block LEFT JOIN genomic_align USING (genomic_align_block_id) LEFT JOIN conservation_score USING (genomic_align_block_id) WHERE genomic_align_block.method_link_species_set_id = "
                    + multi_align_mlss_id
                    + " AND conservation_score.genomic_align_block_id IS NULL GROUP BY genomic_align_block.genomic_align_block_id HAVING count(*) > 3";
        } else {
            useful_sql = "SELECT genomic_align_block.genomic_align_block_id FROM genomic_align_block LEFT JOIN conservation_score USING (genomic_align_block_id) LEFT JOIN genomic_align USING (genomic_align_block_id) LEFT JOIN dnafrag USING (dnafrag_id) WHERE genomic_align_block.method_link_species_set_id = "
                    + multi_align_mlss_id
                    + " AND conservation_score.genomic_align_block_id IS NULL AND genome_db_id <> "
                    + ancestral_seq_id
                    + " GROUP BY genomic_align_block.genomic_align_block_id HAVING count(*) > 3";
        }

        String[] failures = DBUtils.getColumnValues(con, useful_sql);
        if (failures.length == 0) {
            return true;
        }

        // Only a warning if all the unscored blocks have exactly 4 genomes,
        // because it is possible to have (human, chimp, rhesus) and one of
        // cow or dog and still not get above the minimum score threshold
        // (default=0.5).
        // NOTE(review): this query does not exclude ancestral sequences,
        // unlike useful_sql above when an ancestral genome_db exists.
        String useful_sql4 = "SELECT genomic_align_block.genomic_align_block_id FROM genomic_align_block LEFT JOIN genomic_align USING (genomic_align_block_id) LEFT JOIN conservation_score USING (genomic_align_block_id) WHERE genomic_align_block.method_link_species_set_id = "
                + multi_align_mlss_id
                + " AND conservation_score.genomic_align_block_id IS NULL GROUP BY genomic_align_block.genomic_align_block_id HAVING count(*) = 4";
        String[] failures4 = DBUtils.getColumnValues(con, useful_sql4);

        // Equal counts are taken to mean every unscored block is a 4-sequence
        // block, so the result is not marked as failed.
        if (failures.length == failures4.length) {
            ReportManager.problem(this, con, "WARNING conservation_score -> multiple alignments which have more than 3 species but don't have any conservation scores");
            ReportManager.problem(this, con, "WARNING DETAILS: There are " + failures.length + " blocks (mlss= " + multi_align_mlss_id
                    + ") with 4 seqs and no conservation score! Must check that the sum of the branch lengths of these 4 species is less than 0.5 (min_neu_evol). If it is greater than 0.5, there is a problem that needs fixing!");
            ReportManager.problem(this, con, "USEFUL SQL: " + useful_sql4);
            return true;
        }

        ReportManager.problem(this, con, "FAILED conservation_score -> multiple alignments which have more than 3 species but don't have any conservation scores");
        ReportManager.problem(this, con, "FAILURE DETAILS: There are " + failures.length + " blocks (mlss= " + multi_align_mlss_id + ") with more than 4 seqs and no conservation score!");
        ReportManager.problem(this, con, "USEFUL SQL: " + useful_sql);
        return false;
    }

} // CheckConservationScoreSanity