/* * Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute * Copyright [2016-2017] EMBL-European Bioinformatics Institute * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ensembl.healthcheck.testcase.eg_core; import org.ensembl.healthcheck.DatabaseRegistryEntry; import org.ensembl.healthcheck.DatabaseType; import org.ensembl.healthcheck.ReportManager; import org.ensembl.healthcheck.Team; import org.ensembl.healthcheck.testcase.AbstractTemplatedTestCase; import org.ensembl.healthcheck.util.SqlTemplate; /** * test for minimal level of uniprot coverage to ensure all cases are checked * manually * * @author dstaines * */ public class UniProtKB_Coverage extends AbstractTemplatedTestCase { public UniProtKB_Coverage() { super(); this.addToGroup(AbstractEgCoreTestCase.EG_GROUP); this.appliesToType(DatabaseType.CORE); this.setTeamResponsible(Team.ENSEMBL_GENOMES); } private final static double THRESHOLD = 90.0; private final static String QUERY_UNIPROT = "SELECT count(distinct(g.gene_id)) " + "FROM gene g join transcript t using (gene_id) " + "join translation tl using (transcript_id) " + "join object_xref ox on (tl.translation_id=ox.ensembl_id and ox.ensembl_object_type='Translation') " + "join xref x using (xref_id) join external_db d using (external_db_id) " + "join seq_region s on (s.seq_region_id=g.seq_region_id) " + "join coord_system using (coord_system_id) " + "WHERE g.biotype='protein_coding' " + "AND d.db_name IN " + "('Uniprot/SPTREMBL','Uniprot/SPTREMBL_predicted','Uniprot/SWISSPROT','Uniprot/SWISSPROT_predicted') " + "and species_id=?"; private final static String QUERY_GENES = "select count(*) from gene " + "join seq_region using (seq_region_id) " + "join coord_system using (coord_system_id) where biotype='protein_coding' and species_id=?"; @Override protected boolean runTest(DatabaseRegistryEntry dbre) { boolean result = true; SqlTemplate template = getSqlTemplate(dbre); for (int speciesId : dbre.getSpeciesIds()) { int nProteinCoding = template.queryForDefaultObject(QUERY_GENES, Integer.class, speciesId); if (nProteinCoding == 0) { ReportManager.problem(this, dbre.getConnection(), "No protein coding genes found!"); result = false; continue; } int nUniProt = template.queryForDefaultObject(QUERY_UNIPROT, Integer.class, speciesId); double ratio = (100.0 * nUniProt) / nProteinCoding; if (ratio < THRESHOLD) { ReportManager .problem( this, dbre.getConnection(), "Less than " + THRESHOLD + "% of protein_coding genes for species " + speciesId + " have a UniProtKB xref (" + nUniProt + "/" + nProteinCoding + "): this may be correct for some genomes so please check and annotate accordingly"); result = false; } } return result; } }