/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ensembl.healthcheck.testcase.compara;
import java.sql.Connection;
import java.util.Arrays;
import java.util.ArrayList;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase;
import org.ensembl.healthcheck.util.DBUtils;
/**
* An EnsEMBL Healthcheck test case that looks for chromosomes
* missing synteny
*/
public class CheckSyntenySanity extends SingleDatabaseTestCase {
/**
* Create an CheckSynteny that applies to a specific set of databases.
*/
public CheckSyntenySanity() {
setDescription("Check for missing syntenies in the compara database.");
setTeamResponsible(Team.COMPARA);
}
/**
* Run the test.
*
* @param dbre
* The database to use.
* @return true if the test passed.
*
*/
public boolean run(DatabaseRegistryEntry dbre) {
boolean result = true;
Connection con = dbre.getConnection();
if (!tableHasRows(con, "synteny_region")) {
ReportManager.problem(this, con,
"NO ENTRIES in the synteny_region table");
} else if (!tableHasRows(con, "dnafrag_region")) {
ReportManager.problem(this, con,
"NO ENTRIES in the dnafrag_region table");
} else if (!tableHasRows(con, "dnafrag")) {
ReportManager.problem(this, con, "NO ENTRIES in the dnafrag table");
} else {
for (String this_mlss_id : get_all_method_link_species_set_ids(con)) {
result &= check_this_synteny(con, this_mlss_id);
}
}
return result;
}
private ArrayList<String> get_all_method_link_species_set_ids(Connection con) {
ArrayList<String> method_link_species_set_ids = new ArrayList<String>();
String[] method_link_ids = DBUtils
.getColumnValues(
con,
"SELECT method_link_id FROM method_link WHERE class LIKE 'SyntenyRegion%' OR type = 'SYNTENY'");
for (String method_link_id : method_link_ids) {
String[] these_method_link_ids = DBUtils
.getColumnValues(
con,
"SELECT method_link_species_set_id FROM method_link_species_set WHERE method_link_id = "
+ method_link_id);
method_link_species_set_ids.addAll(Arrays.asList(these_method_link_ids));
}
return method_link_species_set_ids;
}
private boolean check_this_synteny(Connection con,
String method_link_species_set_id) {
boolean result = true;
String[] genome_db_ids = DBUtils
.getColumnValues(
con,
"SELECT genome_db_id FROM method_link_species_set LEFT JOIN species_set"
+ " USING (species_set_id) WHERE method_link_species_set_id = "
+ method_link_species_set_id);
/**
* Looks for method_link_species_sets of GenomicAlignBlocks using the
* same species set.
*
* If no synteny regions can be found for the method_link_species_set,
* then the genomic align blocks will be checked.
*
*/
String[] alignment_mlss_ids = DBUtils
.getColumnValues(
con,
"SELECT mlss2.method_link_species_set_id FROM method_link_species_set mlss1,"
+ " method_link_species_set mlss2, method_link ml WHERE mlss1.method_link_species_set_id = "
+ method_link_species_set_id
+ " AND mlss1.species_set_id = mlss2.species_set_id"
+ " AND mlss2.method_link_id = ml.method_link_id AND ml.class like 'GenomicAlignBlock%'");
for (String genome_db_id : genome_db_ids) {
String genome_db_name = DBUtils.getRowColumnValue(con,
"SELECT name FROM genome_db " + " WHERE genome_db_id = "
+ genome_db_id);
/**
* Get ids of dna_frags that are longer that 1Mb. The 'NOT LIKE'
* bits exclude coord systems like
*
* - unknown_singleton
* - unknown_group and
* - chromosome_group.
*
*/
String[] these_dnafrag_ids = DBUtils.getColumnValues(con,
"SELECT dnafrag_id FROM dnafrag WHERE genome_db_id = "
+ genome_db_id
+ " AND coord_system_name IN ('chromosome', 'group')"
+ " AND name NOT LIKE '%\\_%'"
+ " AND name NOT LIKE '%Un%'"
+ " AND name NOT IN ('MT') AND length > 1000000");
for (String dnafrag_id : these_dnafrag_ids) {
/**
* count is the number of synteny regions that are on the
* dnafrag tested in this iteration of the loop that belong
* to the method_link_species_set being tested in this call
* of the method.
*
*/
int count = DBUtils
.getRowCountFast(
con,
"SELECT count(*) FROM synteny_region "
+ " LEFT JOIN dnafrag_region USING (synteny_region_id) WHERE"
+ " method_link_species_set_id = "
+ method_link_species_set_id
+ " AND dnafrag_id = "
+ dnafrag_id);
/*
* If synteny regions were found, this is ok, otherwise check
* alignments from genomic align blocks.
*
*/
if (count == 0) {
int aln_count = 0;
String aln_name = "";
String aln_mlss_id = "";
for (String alignment_mlss_id : alignment_mlss_ids) {
/**
* Name of the dna frag with the greatest amount of
* genomic alignment blocks that make hits on other
* dna frags (foreign alignments) and how many such
* genomic alignment blocks exist on this dna frag.
*/
String[] aln_result = DBUtils
.getRowValues(
con,
"SELECT dnafrag.name, count(*) FROM"
+ " genomic_align ga1 LEFT JOIN genomic_align ga2 USING (genomic_align_block_id)"
+ " LEFT JOIN dnafrag ON (ga2.dnafrag_id = dnafrag.dnafrag_id) WHERE"
+ " ga1.dnafrag_id = "
+ dnafrag_id
+ " AND dnafrag.coord_system_name IN ('chromosome', 'group')"
+ " AND ga1.method_link_species_set_id = "
+ alignment_mlss_id
+ " AND ga1.dnafrag_id <> ga2.dnafrag_id GROUP BY ga2.dnafrag_id "
+ " ORDER BY count(*) DESC LIMIT 1");
if (aln_result.length > 0
&& Integer.valueOf(aln_result[1]).intValue() > aln_count) {
aln_count = Integer.valueOf(aln_result[1])
.intValue();
aln_name = aln_result[0];
aln_mlss_id = alignment_mlss_id;
}
}
/*
* If a dna_frag has more than 1000 foreign alignments,
* this is reported as an error.
*/
if (aln_count > 1000) {
String dnafrag_name = DBUtils.getRowColumnValue(con,
"SELECT name FROM dnafrag "
+ " WHERE dnafrag_id = "
+ dnafrag_id);
String dnafrag_length = DBUtils.getRowColumnValue(con,
"SELECT length FROM dnafrag "
+ " WHERE dnafrag_id = "
+ dnafrag_id);
ReportManager.problem(this, con, aln_count
+ " alignments to " + genome_db_name + " chr."
+ dnafrag_name + " and no syntenies for MLSS "
+ method_link_species_set_id);
result = false;
}
}
}
}
return result;
}
} // CheckHomology