/*
 * Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
 * Copyright [2016-2017] EMBL-European Bioinformatics Institute
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.ensembl.healthcheck.testcase.funcgen;

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Pattern;

import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.DatabaseType;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.testcase.Priority;
import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase;
import org.ensembl.healthcheck.util.DBUtils;

/**
 * Health check that reports features in a funcgen database whose coordinates
 * fall outside their seq_region: a start of 0, or an end greater than the
 * seq_region length recorded in the matching core database.
 */
public class FeaturePosition extends SingleDatabaseTestCase {

    /** Databases whose name matches this pattern are skipped by {@link #run}. */
    private static final Pattern MASTER_SCHEMA_PATTERN = Pattern.compile("master_schema_funcgen_\\d+");

    /** Feature tables inspected by this test. Shouldn't these be defined somewhere more generic? */
    private static final String[] FEATURE_TABLES = {
        "annotated_feature",
        "regulatory_feature",
        "motif_feature",
        "external_feature",
        "segmentation_feature",
        "mirna_target_feature"
    };

    /**
     * Create a new instance
     */
    public FeaturePosition() {
        addToGroup("pre_regulatorybuild");
        addToGroup("funcgen"); // do we need this group and the funcgen-release group?
        addToGroup("funcgen-release");

        setTeamResponsible(Team.FUNCGEN);

        setDescription("Checks if features lie within bounds of seq_region i.e. start !=0 and end <= seq_region length.");
        setPriority(Priority.AMBER);
        setEffect("Low quality features will be included which maybe the result of reads mapping to repeat regions at end of seq_regions.");
        // The report produced by run() lists UPDATE statements (not DELETEs), so point the user at those.
        setFix("Fix: See UPDATE SQL. These should have been filtered within the pipeline!");
    }

    /**
     * This only applies to funcgen databases.
     */
    public void types() {
        // Do we really need these removes?
        removeAppliesToType(DatabaseType.OTHERFEATURES);
        removeAppliesToType(DatabaseType.CDNA);
        removeAppliesToType(DatabaseType.CORE);
        removeAppliesToType(DatabaseType.VARIATION);
        removeAppliesToType(DatabaseType.COMPARA);
    }

    /**
     * Run the test.
     * For every seq_region of the current schema build, and for every feature
     * table, count the features that start at 0 or end beyond the seq_region
     * length taken from the corresponding core database. For
     * regulatory_feature the bound start/end lengths are included in the
     * comparison. One problem is reported per feature table that has
     * offending features, listing the affected seq_regions together with the
     * counting SQL and suggested UPDATE statements.
     *
     * @param dbre
     *          The database to use.
     * @return true if the test passed (or the database was skipped); false if
     *         offending features were found, the core database could not be
     *         found, or a query failed.
     */
    public boolean run(DatabaseRegistryEntry dbre) {

        if (MASTER_SCHEMA_PATTERN.matcher(dbre.getName()).matches()) {
            logger.fine("Skipping " + dbre.getName());
            return true;
        }

        boolean result = true;
        Connection efgCon = dbre.getConnection();

        // The core database name is derived from the funcgen entry rather than
        // from the most recent schema_build in coord_system.
        String schemaBuild = dbre.getSchemaVersion() + "_" + dbre.getGeneBuildVersion();
        String coreDBName = dbre.getSpecies() + "_core_" + schemaBuild;
        DatabaseRegistryEntry coreDbre = getDatabaseRegistryEntryByPattern(coreDBName);

        if (coreDbre == null) {
            ReportManager.problem(this, efgCon, "Could not access default core DB:\t" + coreDBName);
            return false;
        }

        String sql = "select sr.core_seq_region_id, sr.name, sr.seq_region_id from seq_region sr where schema_build='"
                + schemaBuild + "'";
        HashMap<String, String> coreSeqRegionIDName = new HashMap<String, String>();
        HashMap<String, String> nameFuncgenSeqRegionID = new HashMap<String, String>();

        Statement stmt = null;
        ResultSet rs = null;
        try {
            stmt = efgCon.createStatement();
            rs = stmt.executeQuery(sql);
            while (rs.next()) {
                coreSeqRegionIDName.put(rs.getString(1), rs.getString(2));
                nameFuncgenSeqRegionID.put(rs.getString(2), rs.getString(3));
            }
        } catch (SQLException se) {
            // Report the failure so that it is visible in the health check output
            // instead of only appearing as a stack trace on stderr.
            ReportManager.problem(this, efgCon, "SQL Failed:\t" + sql + "\t(" + se.getMessage() + ")");
            return false;
        } finally {
            closeJdbcQuietly(rs, stmt);
        }

        // Look up the length of each seq_region in the core database.
        Connection coreCon = coreDbre.getConnection();
        HashMap<String, String> seqRegionLen = new HashMap<String, String>();

        for (String coreSrID : coreSeqRegionIDName.keySet()) {
            seqRegionLen.put(coreSrID,
                    DBUtils.getRowColumnValue(coreCon, "select length from seq_region where seq_region_id=" + coreSrID));
        }

        for (String fTable : FEATURE_TABLES) {
            StringBuilder problemString = new StringBuilder();
            StringBuilder usefulSQL = new StringBuilder();
            StringBuilder updateSQL = new StringBuilder();
            int totalFeatures = 0;

            for (Map.Entry<String, String> region : seqRegionLen.entrySet()) {
                String srName = coreSeqRegionIDName.get(region.getKey());
                String funcgenRegionID = nameFuncgenSeqRegionID.get(srName);
                String srLength = region.getValue();

                sql = buildCountSql(fTable, funcgenRegionID, srLength);
                int featCount = DBUtils.getRowCount(efgCon, sql);

                // Per the original author's note, the count is -1 (not null) if the SQL failed.
                // Without this explicit exit the test would still show as 'PASSED',
                // as result is true by default.
                if (featCount == -1) {
                    ReportManager.problem(this, efgCon, "SQL Failed:\t" + sql);
                    return false;
                }

                totalFeatures += featCount;

                if (featCount > 0) {
                    // An earlier version proposed DELETE statements here, on the grounds that
                    // peaks over the ends of sequenceable regions are not trusted (they are
                    // likely the start of long ranging repeats where alignments stack up
                    // erroneously). The report now suggests clamping the coordinates instead.
                    // NOTE(review): these UPDATEs only touch seq_region_start/seq_region_end;
                    // for regulatory_feature the bound lengths are left as they are - confirm
                    // this is intended.
                    updateSQL.append("UPDATE ").append(fTable).append(" set seq_region_end =").append(srLength)
                            .append(" WHERE seq_region_id=").append(funcgenRegionID)
                            .append(" AND seq_region_end > ").append(srLength).append(";\n");
                    updateSQL.append("UPDATE ").append(fTable).append(" set seq_region_start=1 WHERE seq_region_id=")
                            .append(funcgenRegionID).append(" AND seq_region_start = 0;\n");

                    usefulSQL.append(sql).append(";\n");
                    problemString.append(" ").append(srName).append("(").append(featCount).append(")");
                    result = false;
                }
            }

            if (problemString.length() > 0) {
                ReportManager.problem(this, efgCon,
                        "Found " + totalFeatures + " " + fTable + "s exceeding seq_region bounds:\t" + problemString
                                + "\nUSEFUL SQL:\n" + usefulSQL
                                + "\nUPDATE SQL:\n" + updateSQL);
            }
        }

        return result;
    }

    /**
     * Build the SQL that counts the out-of-bounds features of one table on one seq_region.
     * Using the funcgen seq_region_id removes the need for a schema_build filter and a
     * seq_region join. The start is compared with 0 only, as the original comment notes
     * the column is unsigned, i.e. never below 0.
     *
     * @param fTable
     *          name of the feature table
     * @param funcgenRegionID
     *          seq_region_id in the funcgen database
     * @param srLength
     *          length of the seq_region as read from the core database
     * @return the counting SQL statement (without trailing semicolon)
     */
    private static String buildCountSql(String fTable, String funcgenRegionID, String srLength) {
        String prefix = "select count(" + fTable + "_id) from " + fTable + " WHERE seq_region_id=" + funcgenRegionID;

        if (fTable.equals("regulatory_feature")) {
            // Only the bound calculation is needed here, as it will be more extreme
            // than or equal to the seq_region loci.
            return prefix + " AND ((seq_region_start - bound_start_length) = 0 "
                    + "OR (seq_region_end + bound_end_length) > " + srLength + ")";
        }

        return prefix + " AND (seq_region_start = 0 OR seq_region_end > " + srLength + ")";
    }

    /**
     * Close a result set and its statement, ignoring any failure to close.
     *
     * @param rs
     *          result set to close, may be null
     * @param stmt
     *          statement to close, may be null
     */
    private static void closeJdbcQuietly(ResultSet rs, Statement stmt) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException ignored) {
                // nothing useful can be done if closing fails
            }
        }
        if (stmt != null) {
            try {
                stmt.close();
            } catch (SQLException ignored) {
                // nothing useful can be done if closing fails
            }
        }
    }
}