/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ensembl.healthcheck.testcase.compara;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.testcase.compara.AbstractComparaTestCase;
import org.ensembl.healthcheck.util.DBUtils;
/**
 * An EnsEMBL Healthcheck test case that looks for broken entries in the
 * method_link_species_set (MLSS) table.
 *
 * <p>The test checks that every MLSS has a source and a name, and that the
 * species-set linked to each MLSS is consistent with what the MLSS name
 * announces (number of genomes and, for unary/binary names, which genomes).
 */
public class CheckMethodLinkSpeciesSetTable extends AbstractComparaTestCase {

    /** Name starting with one abbreviated genome ("H.sap" format) followed by a space. */
    private static final Pattern UNARY_PATTERN = Pattern.compile("^([A-Z]\\.[a-z0-9]{2,3}) ");

    /** Name starting with two abbreviated genomes joined by a dash. */
    private static final Pattern BINARY_PATTERN = Pattern.compile("^([A-Z]\\.[a-z0-9]{2,3})-([A-Z]\\.[a-z0-9]{2,3})");

    /** First run of digits found anywhere in the name; read as the expected number of genomes. */
    private static final Pattern MULTI_PATTERN = Pattern.compile("([0-9]+)");

    /** Leading run of letters followed by a space; read as the collection (species-set) name. */
    private static final Pattern SSNAME_PATTERN = Pattern.compile("^([a-zA-Z]+) ");

    /**
     * One row per MLSS: its name, the number of genomes in its species-set,
     * the comma-separated abbreviations of those genomes (upper-cased first
     * letter of genome_db.name, a dot, then the first three characters of the
     * last '_'-separated token of genome_db.name), the species_set_id, the
     * species-set name and the method_link_species_set_id.
     */
    private static final String MLSS_GENOMES_SQL = "SELECT method_link_species_set.name, count(*),"+
        " GROUP_CONCAT( CONCAT( UPPER(substr(genome_db.name, 1, 1)), '.', SUBSTR(SUBSTRING_INDEX(genome_db.name, '_', -1),1,3) ) ), "+
        " species_set_id, species_set_header.name, "+
        " method_link_species_set_id "+
        " FROM method_link_species_set JOIN species_set USING (species_set_id)"+
        " JOIN species_set_header USING (species_set_id)"+
        " JOIN genome_db USING (genome_db_id) GROUP BY method_link_species_set_id";

    public CheckMethodLinkSpeciesSetTable() {
        setDescription("Check for broken entries in the method_link_species_set table.");
        setTeamResponsible(Team.COMPARA);
    }

    /**
     * Runs the checks against one database.
     *
     * <p>The test fails when an MLSS has no source or no name, when the number
     * of genomes in a species-set contradicts the MLSS name, when a
     * collection-style name does not match the species-set name, or when the
     * species-sets cannot be read at all. Mismatches on the genome
     * abbreviations themselves, and on the "all genomes" fallback, are
     * reported as problems but do not change the returned value.
     *
     * @param dbre the database to check
     * @return true if none of the failing checks listed above was triggered
     */
    public boolean run(DatabaseRegistryEntry dbre) {
        Connection con = dbre.getConnection();
        boolean result = true;

        /* Check number of MLSS with no source */
        result &= checkCountIsZero(con, "method_link_species_set", "source = 'NULL' OR source IS NULL");
        /* Check number of MLSS with no name */
        result &= checkCountIsZero(con, "method_link_species_set", "name = 'NULL' OR name IS NULL");

        /* Check the genomes in the species_set linked to the MLSS table */
        int numOfGenomesInTheDatabase = DBUtils.getRowCount(con, "SELECT count(*) FROM genome_db WHERE taxon_id > 0");

        /* try-with-resources so that the statement and the result set are always released */
        try (Statement stmt = con.createStatement();
                ResultSet rs = stmt.executeQuery(MLSS_GENOMES_SQL)) {
            while (rs.next()) {
                String name = rs.getString(1);
                int num = rs.getInt(2);
                String genomes = rs.getString(3);
                String ssId = rs.getString(4);
                String ssName = rs.getString(5);
                String mlssId = rs.getString(6);

                if (name == null) {
                    /*
                     * Nothing to parse. Matching a null name would throw and
                     * abort the whole loop; null names are already counted by
                     * the "no name" check above.
                     */
                    continue;
                }

                /* Common beginning of most of the messages below */
                String prefix = "FAILED species_set(" + ssId + ") for \"" + name + "\"(" + mlssId + ")";

                Matcher unaryMatcher = UNARY_PATTERN.matcher(name);
                Matcher binaryMatcher = BINARY_PATTERN.matcher(name);
                Matcher multiMatcher = MULTI_PATTERN.matcher(name);
                Matcher ssnameMatcher = SSNAME_PATTERN.matcher(name);

                if (unaryMatcher.find()) {
                    /* Single-genome MLSS: exactly one genome, the one named in the MLSS */
                    if (num != 1) {
                        ReportManager.problem(this, con, prefix + " links to " + num + " genomes instead of 1");
                        result = false;
                    }
                    /* NOTE(review): reported but does not fail the test -- confirm this is intended */
                    if (!unaryMatcher.group(1).equals(genomes)) {
                        ReportManager.problem(this, con, prefix + " links to " + genomes);
                    }

                } else if (binaryMatcher.find()) {
                    String first = binaryMatcher.group(1);
                    String second = binaryMatcher.group(2);
                    /* Both sides of the name are the same abbreviation, e.g. a genome against itself */
                    boolean selfPair = first.equals(second);

                    /* The count is only enforced when the name mentions two different abbreviations */
                    if (num != 2 && !selfPair) {
                        ReportManager.problem(this, con, prefix + " links to " + num + " genomes instead of 2");
                        result = false;
                    }

                    /* The order of the concatenated abbreviations is not fixed, so accept both */
                    boolean matchesPair = (first + "," + second).equals(genomes)
                            || (second + "," + first).equals(genomes);
                    /* A self-pair may also be stored as a single genome */
                    boolean matchesSingle = selfPair && first.equals(genomes);
                    /* NOTE(review): reported but does not fail the test -- confirm this is intended */
                    if (!matchesPair && !matchesSingle) {
                        ReportManager.problem(this, con, "Yes, we felt here... " + prefix + " links to " + genomes + " instead of " + first + "," + second);
                    }

                } else if (multiMatcher.find()) {
                    /* The first number in the name is the expected number of genomes */
                    if (num != Integer.parseInt(multiMatcher.group())) {
                        ReportManager.problem(this, con, prefix + " links to " + num + " genomes instead of " + multiMatcher.group());
                        result = false;
                    }

                } else if (ssnameMatcher.find()) {
                    String firstWord = ssnameMatcher.group(1);
                    if (firstWord.equals("protein") || firstWord.equals("nc") || firstWord.equals("species")) {
                        ReportManager.info(this, con, "\"" + name + "\"(" + mlssId + ") is named using the old convention (the collection name is missing)");
                    } else if (!("collection-" + firstWord).equals(ssName)) {
                        ReportManager.problem(this, con, prefix + " does not start with the species-set name " + ssName);
                        result = false;
                    }

                } else if (num != numOfGenomesInTheDatabase && !isMasterDB(dbre.getConnection())) {
                    /*
                     * No hint in the name: expect every genome with a taxon_id,
                     * except in a master database.
                     * NOTE(review): reported but does not fail the test -- confirm this is intended
                     */
                    ReportManager.problem(this, con, prefix + " links to " + num + " genomes instead of " + numOfGenomesInTheDatabase);
                }
            }
        } catch (Exception e) {
            /* A check that could not be completed must not be reported as passed */
            e.printStackTrace();
            ReportManager.problem(this, con, "Could not check the species-sets linked to the method_link_species_set table: " + e);
            result = false;
        }

        return result;
    }

} // CheckMethodLinkSpeciesSetTable