/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ensembl.healthcheck.testcase.funcgen;
import java.sql.Connection;
import java.util.HashMap;
import java.util.Map;
import java.util.Iterator;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Arrays;
import java.util.ArrayList;
import java.sql.Array;
import java.util.regex.Pattern;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.DatabaseType;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.testcase.Priority;
import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase;
import org.ensembl.healthcheck.util.DBUtils;
public class RegulatorySets extends SingleDatabaseTestCase {
/**
* Create a new instance
*/
public RegulatorySets() {
addToGroup("post_regulatorybuild");
addToGroup("funcgen");//do we need this group and the funcgen-release group?
addToGroup("funcgen-release");
setTeamResponsible(Team.FUNCGEN);
setDescription("Checks if sets have appropriate associations and statues entries");
setPriority(Priority.AMBER);
setEffect("Displays may fail or omit some data");
setFix("Run update_DB_for_release (in fix mode) or run suggested USEFUL SQL");
}
/**
* This only applies to funcgen databases.
*/
public void types() {
//Do we really need these removes?
removeAppliesToType(DatabaseType.OTHERFEATURES);
removeAppliesToType(DatabaseType.CDNA);
removeAppliesToType(DatabaseType.CORE);
removeAppliesToType(DatabaseType.VARIATION);
removeAppliesToType(DatabaseType.COMPARA);
}
/**
* Run the test.
* We will check for all sets associated with regulatory build have appropriate meta_keys, supporting_sets
* and status entries
*
* @param dbre
* The database to use.
* @return true if the test passed.
*
*/
public boolean run(DatabaseRegistryEntry dbre) {
boolean result = true;
Connection efgCon = dbre.getConnection();
//Test for other meta keys here: reg build version?
HashMap fsetInfo = new HashMap<String, HashMap>();
String[] metaKeys = {"feature_set_ids", "focus_feature_set_ids", "feature_type_ids"};
try {
//This account for archived sets
int regSetCount = DBUtils.getRowCount(efgCon, "SELECT distinct(cell_type_id) from feature_set where type='regulatory'");
String[] cellTypes = new String[regSetCount];
int count = 0;
Statement stmt = efgCon.createStatement();
ResultSet rs = stmt.executeQuery("SELECT feature_set_id, name from feature_set where type='regulatory'");
while (rs.next()) {
HashMap fsInfo = new HashMap<String, HashMap>();
String fsetName = rs.getString("name");
// TEST FOR ARCHIVED SETS
//Need to do this for data_set table too?
if(fsetName.matches(".*_v[0-9]+$")) {
ReportManager.problem(this, efgCon, "Found archived regulatory FeatureSet:\n" +
fsetName + "\nUse rollback_experiment.pl to remove these");
result = false;
continue;
}
fsInfo.put("name", fsetName);
String cellType = fsetName.replaceAll("RegulatoryFeatures:", "");
fsInfo.put("cell_type", cellType);
cellTypes[count] = cellType;
// GET META_KEYS
for (int i=0; i < metaKeys.length; i++) {
String fullKey = "regbuild." + cellType + "." + metaKeys[i];
Statement stmtMeta = efgCon.createStatement();
ResultSet rsMeta = stmtMeta.executeQuery("SELECT string from regbuild_string where name='" +
fullKey + "'");
if(! rsMeta.next()){
ReportManager.problem(this, efgCon, "Found absent regbuild_string:\t" +
fullKey);
result = false;
}else{
fsInfo.put(metaKeys[i], rsMeta.getString("string"));
}
}
fsetInfo.put(rs.getString("feature_set_id"), fsInfo);
++count;
}
// TEST FOR OLD/ANOMALOUS META_KEYS
stmt = efgCon.createStatement();
rs = stmt.executeQuery("SELECT name from regbuild_string where name like 'regbuild%ids%'");
while (rs.next()) {
String metaKey = rs.getString("name");
if(metaKey.matches(".*_v[0-9]+$") ){
ReportManager.problem(this, efgCon, "Found archived regbuild_string:\t" +
metaKey);
result = false;
continue;
}
String metaTmp = metaKey.replaceAll("regbuild.", "");
String cellType = metaTmp.replaceAll("\\..*_ids", ""); //will this work without compiling a pattern
String keyType = metaTmp.replaceAll(cellType + ".*\\.", ""); //Will this escape properly?
if(! Arrays.asList(cellTypes).contains(cellType)){
ReportManager.problem(this, efgCon, "Found cell type regbuild_string which is not represented as a " +
"FeatureSet:\t" + metaKey);
result = false;
}
}
//DEAL WITH EACH BUILD SEPARATELY
//We already do most of this in HealthChecker.pm!
Iterator rfsetIt = fsetInfo.keySet().iterator();
while(rfsetIt.hasNext()){ //Regulatory FeatureSets
String fsetID = (String) rfsetIt.next();
HashMap fsInfo = (HashMap) fsetInfo.get(fsetID);
// Check RegFeat data_set
stmt = efgCon.createStatement();
rs = stmt.executeQuery("SELECT name, data_set_id from data_set where feature_set_id=" + fsetID);
if(! rs.next()){
ReportManager.problem(this, efgCon, "Found absent data_set:\t" + fsInfo.get("name"));
result = false;
continue; //while(fsetIt.hasNext())
}
// Set names matches?
if(! fsInfo.get("name").equals(rs.getString("name"))){
ReportManager.problem(this, efgCon, "Found name mismatch between FeatureSet " +
fsInfo.get("name") + " and linked DataSet " + rs.getString("name"));
result = false;
}
String dsetID = rs.getString("data_set_id");
//GET ALL SUPPORTING SET INFO
stmt = efgCon.createStatement();
ResultSet rsDsetSupport = stmt.executeQuery("SELECT ss.supporting_set_id as 'ss_feature_set_id', " +
"fs.feature_set_id as 'fs_feature_set_id', ds.data_set_id as 'ds_data_set_id', ss1.supporting_set_id as 'ss_result_set_id'" +
"from supporting_set ss left join (feature_set fs left join " +
"(data_set ds left join supporting_set ss1 on ds.data_set_id=ss1.data_set_id and ss1.type='result') " +
"on fs.feature_set_id=ds.feature_set_id) on fs.feature_set_id=ss.supporting_set_id " +
"where ss.type='feature' and ss.data_set_id=" + dsetID);
count = 0;
String[] metaFsetIDs = ((String) fsInfo.get("feature_set_ids")).split(",\\s*");
String metaFtypeIDString = (String) fsInfo.get("feature_type_ids");
String[] metaFtypeIDs = metaFtypeIDString.split(",\\s*");
//String[] ssFsetIDs = (String[])((Array) rsDsetSupport.getArray("ss_feature_set_id")).getArray();
//String[] fsFsetIDs = (String[])((Array) rsDsetSupport.getArray("fs_feature_set_id")).getArray();
//String[] dsDsetIDs = (String[])((Array) rsDsetSupport.getArray("ds_data_set_id")).getArray();
//String[] ssRsetIDs = (String[])((Array) rsDsetSupport.getArray("ss_result_set_id")).getArray();
ArrayList<String> ssFsetIDs = new ArrayList<String>();
ArrayList<String> fsFsetIDs = new ArrayList<String>();
ArrayList<String> dsDsetIDs = new ArrayList<String>();
ArrayList<String> ssRsetIDs = new ArrayList<String>();
while(rsDsetSupport.next()){
ssFsetIDs.add(rsDsetSupport.getString("ss_feature_set_id"));
fsFsetIDs.add(rsDsetSupport.getString("fs_feature_set_id"));
dsDsetIDs.add(rsDsetSupport.getString("ds_data_set_id"));
ssRsetIDs.add(rsDsetSupport.getString("ss_result_set_id"));
}
//CHECK META KEY feature_set_ids
boolean sqlSafe = true;
for(int i=0; i < metaFsetIDs.length; i++){
if(! ssFsetIDs.contains(metaFsetIDs[i])){
ReportManager.problem(this, efgCon, "Found feature_set_id in regbuild_string:\t" +
"regbuild." + fsInfo.get("cell_type") + ".feature_set_ids which is not " +
"present as a supporting_set_id for DataSet " + fsInfo.get("name"));
result = false;
sqlSafe = false;
continue;
}
++count;
//Check feature_type is correct
stmt = efgCon.createStatement();
ResultSet ssFtype = stmt.executeQuery("SELECT feature_type_id from feature_set " +
"where feature_set_id=" + metaFsetIDs[i]);
if(! ssFtype.next()){
//Need to test this as we have only data from meta and supporting_set so far!
ReportManager.problem(this, efgCon, "Found absent supporting FeatureSet from regbuild." +
fsInfo.get("cell_type") + ".feature_set_ids:\t" + metaFsetIDs[i]);
//This will also be reported in the CHECK SETS/STATES loop below
continue;
}
if(! ssFtype.getString("feature_type_id").equals(metaFtypeIDs[i])){
ReportManager.problem(this, efgCon, "Found mismatch between meta feature_set_id(" +
ssFtype.getString("feature_type_id") + ") and meta feature_type_id(" +
metaFtypeIDs[i] + ") for " + fsInfo.get("cell_type"));
result = false;
sqlSafe = false;
}
}
//CHECK META SIZE
if(ssFsetIDs.size() != count){
ReportManager.problem(this, efgCon, "");
result = false;
sqlSafe = false;
}
//CHECK META KEY focus_feature_set_ids
String[] metaFFsetIDs = ((String) fsInfo.get("focus_feature_set_ids")).split(",\\s*");
for(int i=0; i < metaFFsetIDs.length; i++){
if(metaFFsetIDs[i].equals("")){
//Now test whether any of the feature_type_ids are of core class
//ResultSet.getFetchSize() does not work for MySQL :/ Just returns 0
//If fetching ftype name would have to process the whole ResultSet before we can get thr true size
//Not that useful to list them, so just print some useful SQL and count instead
ResultSet ftypesRset = stmt.executeQuery("SELECT count(feature_type_id) as num_ftypes from feature_type where feature_type_id in("
+ metaFtypeIDString +
") and class in ('Open Chromatin', 'Transcription Factor' ,'Transcription Factor Complex')");
ftypesRset.next();
int numCoreFsets = ftypesRset.getInt("num_ftypes");//This just returns 0 anyway!!
if(numCoreFsets != 0){
//Useful SQL here?
//To list the offending feature sets or
//just to correct the regbuild strings?
//these should have been corrected by update_DB_for_release?
ReportManager.problem(this, efgCon, "Found empty regbuild." + fsInfo.get("cell_type")
+ ".focus_feature_set_ids regbuild_string. Is this really a projection build?");
result = false;
continue;
}
}
else{
if(! Arrays.asList(metaFsetIDs).contains(metaFFsetIDs[i])){
ReportManager.problem
(
this, efgCon, "Found feature_set_id(" + metaFFsetIDs[i] +
") in regbuild_string:\t" + "regbuild." + fsInfo.get("cell_type") +
".focus_feature_set_ids which is not present in regbuild."
+ fsInfo.get("cell_type") + ".feature_set_ids");
result = false;
//sqlSafe = false;// Is it just the focus key that is wrong?
//Will have already set this otherwise
continue;
}
}
}
//Could check length matches for MultiCell set?
//Could actually remove this meta_key for MultiCell as it should be the same feature_set_ids?
//CHECK SUPPORTING FEATURE/DATA/RESULT SETS, STATES AND DBFILE_REGISTRY
String usefulSQL = "";
String[] dsetStates = {"DISPLAYABLE"};
//my @rset_states = (@dset_states, 'DAS_DISPLAYABLE', $imp_cs_status);
//No method on basic [] array to add other arrays elements in assigment
//The declaration code expects a String and will not interpolate an String[]
//as separate Strings
//String[] rsetStates = {Arrays.asList(dsetStates)., };
//Need to get current CS name here for IMPORTED_'CS_NAME' status
String[] rsetStates = {"DISPLAYABLE"}; //Conditional test for RESULT_FEATURE_SET is below
String[] fsetStates = {"DISPLAYABLE", "MART_DISPLAYABLE"};
String rsetStatesString = Arrays.toString(rsetStates).replaceAll("[\\[\\]]", "'");
String fsetStatesString = Arrays.toString(fsetStates).replaceAll("[\\[\\]]", "'");
String dsetStatesString = Arrays.toString(dsetStates).replaceAll("[\\[\\]]", "'");
rsetStatesString = rsetStatesString.replaceAll(", ", "', '");
fsetStatesString = fsetStatesString.replaceAll(", ", "', '");
dsetStatesString = dsetStatesString.replaceAll(", ", "', '");
//String[] windowSizes = {};//leave file test this to Collection test?
ArrayList<String> absentStates = new ArrayList<String>();
//Could these usefulSql status commands be using IDs which are not valid or null?
//Yes these are unsafe until the the meta_keys/supporting_sets are corrected!
//Change INSERT IGNORE into just select to encourage the HC checker to look at the output first?
//Set up ArrayLists so we can report once for each set after the main loop
ArrayList<String> problemSupportingFsets = new ArrayList<String>();
ArrayList<String> problemSupportingDsets = new ArrayList<String>();
ArrayList<String> problemSupportingRsets = new ArrayList<String>();
for(int i=0; i < fsFsetIDs.size(); i++){
if(fsFsetIDs.get(i) == null){ //fset check
ReportManager.problem(this, efgCon, "RegulatoryFeatures:" + fsInfo.get("cell_type") +
" has absent supporting FeatureSet\t" + ssFsetIDs.get(i));
result = false;
continue;
}
else{ //fset status checks here
absentStates = getAbsentStates(efgCon, fsetStates, "feature_set", fsFsetIDs.get(i).toString());
if(absentStates.size() != 0){
problemSupportingFsets.add(fsFsetIDs.get(i));
}
if(dsDsetIDs.get(i) == null){ //dset check
ReportManager.problem(this, efgCon, "RegulatoryFeatures:" + fsInfo.get("cell_type") +
" has absent DataSet for supporting FeatureSet\t" + fsFsetIDs.get(i));
result = false;
continue;
}
else{ //dset status checks here
absentStates = getAbsentStates(efgCon, dsetStates, "data_set", dsDsetIDs.get(i).toString());
if(absentStates.size() != 0){
problemSupportingDsets.add(dsDsetIDs.get(i));
}
if(ssRsetIDs.get(i) == null){
ReportManager.problem(this, efgCon, "RegulatoryFeatures:" + fsInfo.get("cell_type")
+ " has absent supporting_set ResultSet for supporting DataSet\t" + dsDsetIDs.get(i));
result = false;
continue;
}
else{ //rset tests and status checks here
stmt = efgCon.createStatement();
rs = stmt.executeQuery("SELECT rs.name as rs_name, sn.name as sn_name, dbr.path from result_set rs " +
"LEFT JOIN (status s join status_name sn ON " +
"s.status_name_id=sn.status_name_id AND sn.name='RESULT_FEATURE_SET') " +
"ON rs.result_set_id=s.table_id AND s.table_name='result_set' " +
"LEFT JOIN dbfile_registry dbr ON rs.result_set_id=dbr.table_id AND dbr.table_name='result_set' " +
"WHERE rs.result_set_id=" + ssRsetIDs.get(i));
if(! rs.next()){
ReportManager.problem(this, efgCon, "RegulatoryFeatures:" + fsInfo.get("cell_type") +
" supporting DataSet has absent supporting ResultSet:\t" + ssRsetIDs.get(i));
result = false;
}
else{
if(rs.getString("sn_name") == null){ //Should be a Collection
//Test dbfile_registry entries match rset name
//Leave file tests to separate HC which only deals with the dbfile_registry table and files
String dbfPath = rs.getString("path");
//result_feature/Monocytes-CD14+_H3K27me3_ENCODE_Broad_bwa_samse
if(dbfPath == null){
//test result_set_input type too!
ReportManager.problem(this, efgCon,
"Could not find dbfile_registry entry for ResultSet which is " +
"not a RESULT_FEATURE_SET:\t" + ssRsetIDs.get(i));
result = false;
}
else if(! dbfPath.matches(".*" + Pattern.quote(rs.getString("rs_name")) + ".*")){//rset_name matches path?
// This now allows fuzzy matching on path to allow for 'versioned' data.
ReportManager.problem(this, efgCon,
"Found mismatch between ResultSet name and dbfile_registry.path:\t" +
rs.getString("rs_name") + " vs " + dbfPath);
result = false;
}
}
absentStates = getAbsentStates(efgCon, rsetStates, "result_set", ssRsetIDs.get(i).toString());
if(absentStates.size() != 0){
//do this for whole set of supporting rsets?
problemSupportingRsets.add(ssRsetIDs.get(i));
}
}
}//end of if(ssRsetIDs.get(i) == null){ else
}
}// end of if(fsFsetIDs.get(i) == null){ else
}//end of for loop
if(problemSupportingFsets.size() > 0){
if(sqlSafe){
usefulSQL = "\nUSEFUL SQL:\tINSERT IGNORE INTO status SELECT fs.feature_set_id, 'feature_set', sn.status_name_id from feature_set fs, status_name sn " +
"WHERE sn.name in (" + fsetStatesString + ") AND fs.feature_set_id IN (" +
problemSupportingFsets.toString().replaceAll("[\\[\\]]", "") + ");";
}
ReportManager.problem(this, efgCon, "Found absent states (from " + Arrays.toString(fsetStates) +
") for supporting FeatureSets:\t" + problemSupportingFsets.toString() + usefulSQL);
result = false;
}
if(problemSupportingDsets.size() > 0){
if(sqlSafe){
usefulSQL = "\nUSEFUL SQL:\tINSERT IGNORE INTO status SELECT ds.data_set_id, 'data_set', sn.status_name_id from data_set ds, status_name sn " +
"WHERE sn.name in (" + dsetStatesString + ") " +
"AND ds.data_set_id IN (" + problemSupportingDsets.toString().replaceAll("[\\[\\]]", "") + ");";
}
ReportManager.problem
(this, efgCon, "Found some absent states (from " + dsetStatesString
+ ") for supporting DataSet:\t" + problemSupportingDsets.toString() + usefulSQL);
result = false;
}
if(problemSupportingRsets.size() > 0){
if(sqlSafe){
usefulSQL = "\nUSEFUL SQL:\tINSERT IGNORE INTO status SELECT rs.result_set_id, 'result_set', sn.status_name_id from result_set rs, status_name sn " +
//Really need a java 'join' here
"WHERE sn.name in (" + rsetStatesString +
") AND rs.result_set_id IN (" + problemSupportingRsets.toString().replaceAll("[\\[\\]]", "") + ");";
}
ReportManager.problem
(
this, efgCon, "RegulatoryFeatures:" + fsInfo.get("cell_type") +
" supporting DataSets have supporting ResultSet with some absent states (from "
+ rsetStatesString + "):\t" + problemSupportingRsets.toString() + usefulSQL);
result = false;
}
}
//This doesn't need to be inside the try, apart from access to regSetCount
if(regSetCount > 0){
//Add bit to compare previous version and dates
//if there version has changed then we should expect at least
//one of the dates to change?
//Unless it is a patch version
//e.g. remove just a few problem features
//dates remain same but version goes from 12 to 12.1?
Connection secCon = getEquivalentFromSecondaryServer(dbre).getConnection();
if (secCon == null) {
logger.warning("Can't get equivalent database for " + dbre.getName());
return true;
}
//These are empty strings not nullsif not present
String sqlQuery = "select meta_value from meta where meta_key='regbuild.version'";
String oldRelVersion = DBUtils.getRowColumnValue(secCon, sqlQuery);
String newRelVersion = DBUtils.getRowColumnValue(efgCon, sqlQuery);
sqlQuery = "select meta_value from meta where meta_key=" +
"'regbuild.initial_release_date'";
String oldRelDate = DBUtils.getRowColumnValue(secCon, sqlQuery);
String newRelDate = DBUtils.getRowColumnValue(efgCon, sqlQuery);
sqlQuery = "select meta_value from meta where meta_key=" +
"'regbuild.last_annotation_update'";
String oldAnnoDate = DBUtils.getRowColumnValue(secCon, sqlQuery);
String newAnnoDate = DBUtils.getRowColumnValue(efgCon, sqlQuery);
//Deal with incomplete data first
if( newRelVersion.isEmpty() &&
newRelDate.isEmpty() &&
newAnnoDate.isEmpty() ){
ReportManager.warning
(this, efgCon,
"Found no current release version and/or dates, unable to complete test");
result = false;
}
else{
if( oldRelVersion.isEmpty() &&
oldRelDate.isEmpty() &&
oldAnnoDate.isEmpty() ){
ReportManager.warning
(this, efgCon,
"Found no previous release version or dates, assuming this is a virgin build");
//We have already done a test for all new values
//assume if there are defined then they are valid as we have nothing to compare them to.
}
else{
if( oldRelVersion.isEmpty() ||
oldRelDate.isEmpty() ||
oldAnnoDate.isEmpty() ){
ReportManager.problem
(this, efgCon,
"Previous release version and/or dates meta entries are incomplete, " +
"cannot complete regbuild dates test");
result = false;
}
else{ //WE have complete set of old an new values
if(! oldRelVersion.equals(newRelVersion) ){ // last_annotation_update should differ
if(! newAnnoDate.equals(oldAnnoDate) ){
ReportManager.problem
(this, efgCon,
"The regbuild.release version has changed, but the " +
"rebguild.last_annotation_update has not"
);
result = false;
}
if(! newRelDate.equals(oldRelDate) ){
ReportManager.warning
(this, efgCon,
"The regbuild.release version has changed, but the " +
"rebguild.initial_release_date has not"
);
}
}
else{ // rel version match so should dates!
if(! newAnnoDate.equals(oldAnnoDate) ){
ReportManager.warning
(this, efgCon,
"The last_annotation_update has changed but the release_version has not"
);
result = false;
}
if(! newRelDate.equals(oldRelDate) ){
ReportManager.warning
(this, efgCon,
"The initial_release_date has changed but the release_version has not"
);
result = false;
}
}
}
}
}
}//end of regbuild version data test
}
catch (SQLException se) {
//Does this exit and return false?
se.printStackTrace();
//This currently still returns PASSED and does print the 'problem'!
ReportManager.problem(this, efgCon, "Caught SQLException");
result = false;
}
return result;
}
//Move this to SingleFuncgenTestCase?
public ArrayList getAbsentStates(Connection efgCon, String[] statusNames, String tableName, String tableID){
ArrayList<String> absentStates = new ArrayList<String>();
try{
Statement stmt = efgCon.createStatement();
String sqlCmd = "";
for (int i = 0; i< statusNames.length; i++){
ResultSet rs = stmt.executeQuery("SELECT s.table_id from status s join status_name sn " +
"ON s.status_name_id=sn.status_name_id WHERE sn.name='" + statusNames[i] +
"' AND s.table_name='" + tableName + "' AND s.table_id=" + tableID);
if(! rs.next()){
absentStates.add(statusNames[i]);
}
}
}catch (SQLException se) {
//Does this exit and return false?
se.printStackTrace();
// ReportManager.problem(this, efgCon, "Caught SQLException");
}
return absentStates;
}
}