/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ensembl.healthcheck.testcase.funcgen;
import java.sql.Connection;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.DatabaseType;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.testcase.Priority;
import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase;
import org.ensembl.healthcheck.util.DBUtils;
/**
 * Healthcheck comparing the motif features attached to regulatory features with
 * the motif features reachable through the annotated features attached to those
 * same regulatory features, and flagging motif features that fall outside the
 * bounds of the regulatory feature they are attached to.
 */
public class RegulatoryMotifFeatures extends SingleDatabaseTestCase {

    /**
     * Maximum length (in bp) of an annotated feature whose motif features are
     * taken into account. Accounts for potential out of bounds MFs from demoted TFs.
     */
    private static final int MAX_ANNOTATED_FEATURE_LENGTH = 2000;

    /**
     * Create a new instance of RegulatoryMotifFeatures and register its groups,
     * responsible team, description, priority, effect and fix.
     */
    public RegulatoryMotifFeatures() {
        addToGroup("post_regulatorybuild");
        addToGroup("funcgen"); // do we need this group and the funcgen-release group?
        addToGroup("funcgen-release");
        // Not flagged as long running: should be relatively fast.
        // setHintLongRunning(true);
        setTeamResponsible(Team.FUNCGEN);
        setDescription("Checks if all motifs from annotated features are associated to their respective regulatory features.");
        setPriority(Priority.AMBER);
        setEffect("Regulatory Features will seem to miss some motif features.");
        setFix("Re-project motif features or fix manually.");
    }

    /**
     * This only applies to funcgen databases: the OTHERFEATURES, CDNA, CORE,
     * VARIATION and COMPARA database types are removed from the applicable types.
     */
    public void types() {
        // Do we really need these removes?
        removeAppliesToType(DatabaseType.OTHERFEATURES);
        removeAppliesToType(DatabaseType.CDNA);
        removeAppliesToType(DatabaseType.CORE);
        removeAppliesToType(DatabaseType.VARIATION);
        removeAppliesToType(DatabaseType.COMPARA);
    }

    /**
     * Run the test.
     * We will check if all the motif features in a regulatory feature contain all
     * the motif features associated to the annotated features associated to the
     * regulatory feature, and that no motif feature attached to a regulatory
     * feature lies outside that regulatory feature's bounds.
     * Both checks are always executed so that every problem is reported.
     *
     * @param dbre
     *          The database to use.
     * @return true if the test passed, i.e. neither check reported a problem.
     */
    public boolean run(DatabaseRegistryEntry dbre) {
        Connection con = dbre.getConnection();

        // Evaluate both checks unconditionally (no short-circuit) so a failure of
        // the first one never hides a failure of the second.
        boolean countsAgree = checkMotifFeatureCounts(con);
        boolean allWithinBounds = checkMotifFeaturesWithinBounds(con);

        return countsAgree && allWithinBounds;
    }

    /**
     * Compares the distinct number of motif features attached to regulatory
     * features with the distinct number of motif features associated to the
     * annotated features (no longer than MAX_ANNOTATED_FEATURE_LENGTH) attached
     * to regulatory features, and reports a problem when they differ.
     *
     * @param con
     *          The connection to the database under test.
     * @return true if both counts are equal.
     */
    private boolean checkMotifFeatureCounts(Connection con) {
        // NOTE: SQL strings unbroken to allow easy copying into MySQL client
        // Restricts to current i.e. non-archive fsets; archived ones have a v[0-9]+ suffix
        // NOTE(review): the pattern is not anchored with '$', so it excludes any name
        // containing _v<digits>, not only names ending with it - confirm this is intended.
        // This has to be a distinct count here as MF reg attrs are non-redundant across AFs within an RF
        // Hence the 2nd count would always be higher
        // Need to change this to iterate over cell type build to capture differences within cell lines,
        // which may be currently masked by the cell type wide distinct count.
        int regMFs = DBUtils.getRowCount(con, "SELECT count(distinct attribute_feature_id) from regulatory_attribute ra join regulatory_feature rf on ra.regulatory_feature_id=rf.regulatory_feature_id and attribute_feature_table='motif' join feature_set fs on rf.feature_set_id=fs.feature_set_id and fs.name not rlike '.*_v[0-9]+'");
        // Unrestricted alternative:
        // "SELECT COUNT(distinct attribute_feature_id) from regulatory_attribute where attribute_feature_table='motif'"

        int regAMFs = DBUtils.getRowCount(con, "select count(distinct amf.motif_feature_id) from associated_motif_feature amf, annotated_feature af, regulatory_attribute ra, regulatory_feature rf, feature_set fs where af.annotated_feature_id=amf.annotated_feature_id and ra.attribute_feature_id=af.annotated_feature_id and ra.attribute_feature_table='annotated' and (af.seq_region_end - af.seq_region_start +1) <= " + MAX_ANNOTATED_FEATURE_LENGTH + " and ra.regulatory_feature_id=rf.regulatory_feature_id and rf.feature_set_id=fs.feature_set_id and fs.name not rlike '.*_v[0-9]+'");

        if (regMFs == regAMFs) {
            return true;
        }

        // This incorporates mfs for afs <= 2000bp where the af has been integrated into the rf, but the mf hasn't for some reason?
        // Or when we have deleted an af but not an associated mf that were both supporting an rf
        // The message states "distinct" and "at most" to match what the two queries above actually count.
        ReportManager.problem(this, con,
            "The number of distinct motif features associated to regulatory features (" + regMFs
            + ") does not correspond to the number of distinct motif features within its associated annotated features ("
            + regAMFs + ") which are at most " + MAX_ANNOTATED_FEATURE_LENGTH + " bp\n"
            + "USEFUL SQL:\nALTER table regulatory_attribute add index `attribute_id_type`(attribute_feature_table, attribute_feature_id);\n"
            + "insert ignore into regulatory_attribute select ra.regulatory_feature_id, amf.motif_feature_id, 'motif' from "
            + "annotated_feature af, regulatory_attribute ra, associated_motif_feature amf left join "
            + "regulatory_attribute ra1 on (amf.motif_feature_id=ra1.attribute_feature_id and ra1.attribute_feature_table='motif') "
            + "where af.annotated_feature_id=amf.annotated_feature_id and ra.attribute_feature_id=af.annotated_feature_id and "
            + "ra.attribute_feature_table='annotated' and (af.seq_region_end - af.seq_region_start +1) <= "
            + MAX_ANNOTATED_FEATURE_LENGTH
            + " and ra1.attribute_feature_id is NULL;\nALTER table regulatory_attribute drop index `attribute_id_type`;");
        return false;
    }

    /**
     * Counts the motif features attached to a regulatory feature which end before
     * (seq_region_start - bound_start_length) or start after
     * (seq_region_end + bound_end_length) of that regulatory feature, and reports
     * a problem when any are found.
     *
     * @param con
     *          The connection to the database under test.
     * @return true if no such motif feature was found.
     */
    private boolean checkMotifFeaturesWithinBounds(Connection con) {
        // NOTE: SQL strings unbroken to allow easy copying into MySQL client
        int outOfBoundMFs = DBUtils.getRowCount(con, "SELECT count(mf.motif_feature_id) FROM feature_set fs, regulatory_feature rf, regulatory_attribute ra, motif_feature mf WHERE fs.feature_set_id=rf.feature_set_id AND rf.regulatory_feature_id=ra.regulatory_feature_id AND ra.attribute_feature_table='motif' AND ra.attribute_feature_id=mf.motif_feature_id AND fs.name not rlike '.*_v[0-9]+' AND ( (mf.seq_region_end < (rf.seq_region_start - rf.bound_start_length)) OR (mf.seq_region_start > (rf.seq_region_end + rf.bound_end_length)))");

        if (outOfBoundMFs == 0) {
            return true;
        }

        // NOTE(review): the message says "core region" although the query tests against the
        // start/end extended by the bound lengths - confirm the intended wording.
        ReportManager.problem(this, con,
            "Found " + outOfBoundMFs + " MotifFeatures which lie outside the core region. USEFUL SQL:\n"
            + "SELECT mf.motif_feature_id, mf.seq_region_start as 'mf start', mf.seq_region_end as 'mf end', rf.regulatory_feature_id as 'rf ID', rf.seq_region_start as 'rf seq start', rf.seq_region_end as 'rf seq end', rf.bound_start_length as 'bound start', rf.bound_end_length as 'bound end', rf.seq_region_start - rf.bound_start_length as 'real start', rf.seq_region_end + rf.bound_end_length as 'real end', lpad(rf.stable_id, 11, 0) FROM feature_set fs, regulatory_feature rf, regulatory_attribute ra, motif_feature mf WHERE fs.feature_set_id = rf.feature_set_id and rf.regulatory_feature_id = ra.regulatory_feature_id and ra.attribute_feature_table = 'motif' and ra.attribute_feature_id = mf.motif_feature_id and fs.name not rlike '.*_v[0-9]+' AND ( (mf.seq_region_end < (rf.seq_region_start - rf.bound_start_length)) OR (mf.seq_region_start > (rf.seq_region_end + rf.bound_end_length)))");
        return false;
    }
}