/* * Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute * Copyright [2016-2017] EMBL-European Bioinformatics Institute * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ensembl.healthcheck.testcase.generic; import java.sql.Connection; import java.sql.ResultSet; import java.sql.Statement; import org.ensembl.healthcheck.DatabaseRegistryEntry; import org.ensembl.healthcheck.DatabaseType; import org.ensembl.healthcheck.ReportManager; import org.ensembl.healthcheck.Team; import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase; /** * An EnsEMBL Healthcheck test case which checks if any genes are obvious duplicates of each other (it might be all OK, but it's * worth a look!) */ public class DuplicateGenes extends SingleDatabaseTestCase { private static final int MAX_WARNINGS = 10; /** * Create an OrphanTestCase that applies to a specific set of databases. */ public DuplicateGenes() { setTeamResponsible(Team.GENEBUILD); } /** * This test only applies to core and Vega databases. */ public void types() { removeAppliesToType(DatabaseType.OTHERFEATURES); removeAppliesToType(DatabaseType.CDNA); removeAppliesToType(DatabaseType.RNASEQ); } /** * Check for (strongly likely to be) duplicate genes. * * @param dbre * The database to check. * @return True if the test passes. */ public boolean run(DatabaseRegistryEntry dbre) { boolean result = true; String sql = "SELECT g.gene_id, g.seq_region_start AS start, g.seq_region_end AS end, g.seq_region_id AS chromosome_id, g.seq_region_strand AS strand, g.biotype, g.stable_id, g.analysis_id, g.display_xref_id, g.source, g.description, g.is_current, g.canonical_transcript_id " + " FROM gene g ORDER BY chromosome_id, strand, start, end"; Connection con = dbre.getConnection(); try { Statement stmt = con.createStatement(java.sql.ResultSet.TYPE_FORWARD_ONLY, java.sql.ResultSet.CONCUR_READ_ONLY); stmt.setFetchSize(1000); ResultSet rs = stmt.executeQuery(sql); int geneStart, geneEnd, geneChromosome, geneId, geneStrand; int lastGeneId = 0; int lastGeneStart = -1; int lastGeneEnd = -1; int lastGeneChromosome = -1; int lastGeneStrand = -1; int duplicateGene = 0; int geneAnalysis = -1; int geneDisplayXref = -1; String geneSource = ""; String geneDescription = ""; int geneIsCurrent = -1; int geneCanonicalTranscript = -1; String geneCanonicalAnnotation = ""; String geneBioType, geneStableID; String lastGeneBioType = ""; String lastGeneStableID = ""; int lastGeneAnalysis = -1; int lastGeneDisplayXref = -1; String lastGeneSource = ""; String lastGeneDescription = ""; int lastGeneIsCurrent = -1; int lastGeneCanonicalTranscript = -1; String lastGeneCanonicalAnnotation = ""; boolean first = true; while (rs.next()) { // load the vars geneId = rs.getInt(1); geneStart = rs.getInt(2); geneEnd = rs.getInt(3); geneChromosome = rs.getInt(4); geneStrand = rs.getInt(5); geneBioType = rs.getString(6); geneStableID = rs.getString(7); geneAnalysis = rs.getInt(8); geneDisplayXref = rs.getInt(9); geneSource = rs.getString(10); geneDescription = rs.getString(11); geneIsCurrent = rs.getInt(12); geneCanonicalTranscript = rs.getInt(13); // canonical_annotation removed in 74 // geneCanonicalAnnotation = rs.getString(15); if (!first) { // for sangervega, we only want to report true duplicates (i.e. genes that have all fields identical) if (lastGeneChromosome == geneChromosome && lastGeneStart == geneStart && lastGeneEnd == geneEnd && lastGeneStrand == geneStrand && geneBioType.equals(lastGeneBioType) && (dbre.getType() != DatabaseType.SANGER_VEGA || (lastGeneAnalysis == geneAnalysis && lastGeneDisplayXref == geneDisplayXref && lastGeneSource == geneSource && lastGeneDescription == geneDescription && lastGeneIsCurrent == geneIsCurrent && lastGeneCanonicalTranscript == geneCanonicalTranscript && lastGeneCanonicalAnnotation == geneCanonicalAnnotation))) { duplicateGene++; if (duplicateGene < MAX_WARNINGS) { ReportManager.warning(this, con, "Gene " + geneStableID + " (" + geneBioType + " ID " + geneId + ") is duplicated - see gene " + lastGeneStableID + " (" + lastGeneBioType + " ID " + lastGeneId + ")"); } } } else { first = false; } lastGeneId = geneId; lastGeneStart = geneStart; lastGeneEnd = geneEnd; lastGeneChromosome = geneChromosome; lastGeneStrand = geneStrand; lastGeneBioType = geneBioType; lastGeneStableID = geneStableID; lastGeneAnalysis = geneAnalysis; lastGeneDisplayXref = geneDisplayXref; lastGeneSource = geneSource; lastGeneDescription = geneDescription; lastGeneIsCurrent = geneIsCurrent; lastGeneCanonicalTranscript = geneCanonicalTranscript; lastGeneCanonicalAnnotation = geneCanonicalAnnotation; } // while rs if (duplicateGene > 0) { ReportManager.problem(this, con, "Has " + duplicateGene + " duplicated genes."); result = false; } rs.close(); stmt.close(); } catch (Exception e) { result = false; e.printStackTrace(); } // EG return correct report line if all is OK if (result) ReportManager.correct(this, con, "No duplicate genes found"); return result; } } // DuplicateGenes