/* * Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute * Copyright [2016-2017] EMBL-European Bioinformatics Institute * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ensembl.healthcheck.testcase.funcgen; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.ensembl.healthcheck.DatabaseRegistryEntry; import org.ensembl.healthcheck.DatabaseType; import org.ensembl.healthcheck.ReportManager; import org.ensembl.healthcheck.Team; import org.ensembl.healthcheck.testcase.Repair; import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase; /** * @author mnuhn * */ public class DuplicateProbeFeatures extends SingleDatabaseTestCase implements Repair { final private int maxUsefulSqlStatements = 10; protected Connection con; public DuplicateProbeFeatures() { setTeamResponsible(Team.FUNCGEN); setDescription("Tests for duplicate probe features and can remove them."); } public void types() { appliesToType(DatabaseType.FUNCGEN); } protected List<String> usefulSql() throws SQLException { ResultSet rs = getDuplicates(); List<String> sqlStatements = new ArrayList<String>(); while (rs.next() && sqlStatements.size()<maxUsefulSqlStatements) { String commaSeparatedIdList = rs.getString("ids"); String[] ids = commaSeparatedIdList.split(","); StringBuffer idsToDelete = new StringBuffer(); int idsInCurrentStatement = 0; // Don't skip the first in the list, we want to show all. for(int i=0; i<ids.length; i++) { idsToDelete.append(ids[i]); idsInCurrentStatement++; if (idsInCurrentStatement>=10) { sqlStatements.add("select * from probe_feature where probe_feature_id in (" + idsToDelete + ");"); idsToDelete = new StringBuffer(); idsInCurrentStatement = 0; } else { // No comma after the last one. if (i+1<ids.length) { idsToDelete.append(", "); } } } if (idsToDelete.length()>0) { sqlStatements.add("select * from probe_feature where probe_feature_id in (" + idsToDelete + ");"); } } return sqlStatements; } protected List<String> repairSql() throws SQLException { final int maxIdsInOneStatement = 10; ResultSet rs = getDuplicates(); List<String> sqlStatements = new ArrayList<String>(); while (rs.next()) { String commaSeparatedIdList = rs.getString("ids"); String[] ids = commaSeparatedIdList.split(","); StringBuffer idsToDelete = new StringBuffer(); int idsInCurrentStatement = 0; // Skip the first in the list, the others get deleted. for(int i=1; i<ids.length; i++) { idsToDelete.append(ids[i]); idsInCurrentStatement++; if (idsInCurrentStatement>=maxIdsInOneStatement) { sqlStatements.add("delete from probe_feature where probe_feature_id in (" + idsToDelete + ");"); idsToDelete = new StringBuffer(); idsInCurrentStatement = 0; } else { // No comma after the last one. if (i+1<ids.length) { idsToDelete.append(", "); } } } if (idsToDelete.length()>0) { sqlStatements.add("delete from probe_feature where probe_feature_id in (" + idsToDelete + ");"); } } return sqlStatements; } protected ResultSet getDuplicates() throws SQLException { Statement stmt = con.createStatement(java.sql.ResultSet.TYPE_FORWARD_ONLY, java.sql.ResultSet.CONCUR_READ_ONLY); stmt.setFetchSize((Integer.MIN_VALUE)); // By default group_concat gets cut off after 1000 characters, we // don't want to loose any, so setting this to a higher value. // stmt.executeQuery("SET SESSION group_concat_max_len = 1000000"); ResultSet rs = stmt.executeQuery(getDuplicateSql()); return rs; } protected int getNumDuplicates() throws SQLException { Statement stmt = con.createStatement(); ResultSet rs = stmt.executeQuery(getNumDuplicatesSql()); rs.next(); int numDuplicates = rs.getInt("numDuplicates"); return numDuplicates; } protected String getDuplicateSql() { return "SELECT count(probe_feature_id) num_occurrences, group_concat(cast(probe_feature_id as char)) ids, seq_region_id, seq_region_start, seq_region_end, probe_id, analysis_id, cigar_line " + "FROM probe_feature " + "group by seq_region_id, seq_region_start, seq_region_end, probe_id, analysis_id, cigar_line " + "having num_occurrences > 1 order by num_occurrences desc"; } protected String getNumDuplicatesSql() { return "select count(*) numDuplicates from (" + getDuplicateSql() + ") duplicate_list"; } public boolean run(DatabaseRegistryEntry dbre) { boolean testPassed; String sql = "SELECT count(*) as numberOfRows FROM probe_feature;"; con = dbre.getConnection(); try { Statement stmt = con.createStatement(); ResultSet rs = stmt.executeQuery(sql); rs.next(); int numberOfRows = rs.getInt("numberOfRows"); logger.info("There are " + numberOfRows + " rows in the probe_feature table."); int numDuplicates = getNumDuplicates(); if (numDuplicates > 0) { numDuplicates = getNumDuplicates(); String msg = "Has " + numDuplicates + " duplicated probe features."; logger.severe(msg); ReportManager.problem(this, con, msg); List<String >sqlStatements = usefulSql(); logger.info("Useful sql:"); Iterator<String> i = sqlStatements.iterator(); while(i.hasNext()) { logger.info(i.next()); } testPassed = false; } else { String msg = "No duplicate probe features found"; logger.info(msg); ReportManager.correct(this, con, msg); testPassed = true; } } catch (SQLException e) { testPassed = false; e.printStackTrace(); return testPassed; } return testPassed; } @Override public void repair(DatabaseRegistryEntry dbre) { showOrRepair(dbre, false); } @Override public void show(DatabaseRegistryEntry dbre) { showOrRepair(dbre, true); } public void showOrRepair(DatabaseRegistryEntry dbre, boolean onlyShow) { this.con = dbre.getConnection(); List<String> repairSqlStatements; try { Statement stmt = con.createStatement(); repairSqlStatements = repairSql(); Iterator<String> iterator = repairSqlStatements.listIterator(); while (iterator.hasNext()) { String sql = iterator.next(); if (!onlyShow) { logger.info("Executing: " + sql); stmt.executeUpdate(sql); } else { logger.info("Showing only: " + sql); } } stmt.close(); } catch (SQLException e) { e.printStackTrace(); return; } } }