/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* GoTermCount
*
* @author dstaines
* @author $Author$
* @version $Revision$
*/
package org.ensembl.healthcheck.testcase.eg_core;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.util.SqlTemplate;
/**
* Test to check that at least 50% of protein coding genes have at least one GO
* term
*
* @author dstaines
*
*/
public class GoTermCount extends AbstractEgCoreTestCase {
private final static String GENE_COUNT_SQL = "select count(distinct(gene.gene_id)) from gene "
+ "join seq_region using (seq_region_id) "
+ "join coord_system using (coord_system_id) "
+ "where gene.biotype='protein_coding' and species_id=?";
private final static String GO_TRANSLATION_COUNT_SQL = "select count(distinct(gene.gene_id)) from gene "
+ "join seq_region using (seq_region_id) "
+ "join coord_system using (coord_system_id) "
+ "join transcript using (gene_id) "
+ "join translation using (transcript_id) "
+ "join object_xref on (translation_id=ensembl_id AND ensembl_object_type='Translation') "
+ "join xref using (xref_id) "
+ "join external_db using (external_db_id) "
+ "join ontology_xref using (object_xref_id) "
+ "where gene.biotype='protein_coding' and external_db.db_name='GO' and species_id=?";
private final static String GO_TRANSCRIPT_COUNT_SQL = "select count(distinct(gene.gene_id)) from gene "
+ "join seq_region using (seq_region_id) "
+ "join coord_system using (coord_system_id) "
+ "join transcript using (gene_id) "
+ "join object_xref on (transcript_id=ensembl_id AND ensembl_object_type='Transcript') "
+ "join xref using (xref_id) "
+ "join external_db using (external_db_id) "
+ "join ontology_xref using (object_xref_id) "
+ "where gene.biotype='protein_coding' and external_db.db_name='GO' and species_id=?";
private final static String GO_GENE_COUNT_SQL = "select count(distinct(gene.gene_id)) from gene "
+ "join seq_region using (seq_region_id) "
+ "join coord_system using (coord_system_id) "
+ "join object_xref on (gene_id=ensembl_id AND ensembl_object_type='Gene') "
+ "join xref using (xref_id) "
+ "join external_db using (external_db_id) "
+ "join ontology_xref using (object_xref_id) "
+ "where gene.biotype='protein_coding' and external_db.db_name='GO' and species_id=?";
private final static double THRESHOLD = 0.4;
public GoTermCount() {
super();
}
/*
* (non-Javadoc)
*
* @see
* org.ensembl.healthcheck.testcase.AbstractTemplatedTestCase#runTest(org
* .ensembl.healthcheck.DatabaseRegistryEntry)
*/
@Override
protected boolean runTest(DatabaseRegistryEntry dbre) {
boolean result = true;
SqlTemplate temp = getSqlTemplate(dbre);
// count number genes
for (int speciesId : dbre.getSpeciesIds()) {
int geneN = temp.queryForDefaultObject(GENE_COUNT_SQL,
Integer.class, speciesId);
// count number of genes with at least 1 GO term (try gene,
// transcript,
// translation)
int goN = temp.queryForDefaultObject(GO_TRANSLATION_COUNT_SQL,
Integer.class, speciesId);
if (goN == 0) {
goN = temp.queryForDefaultObject(GO_TRANSCRIPT_COUNT_SQL,
Integer.class, speciesId);
}
if (goN == 0) {
goN = temp.queryForDefaultObject(GO_GENE_COUNT_SQL,
Integer.class, speciesId);
}
double ratio = (double) goN / geneN;
if (ratio < THRESHOLD) {
ReportManager.problem(this, dbre.getConnection(), goN
+ " protein_coding genes of a total of " + geneN
+ " for species " + speciesId
+ " have at least one GO term -"
+ " this is less than the suggested threshold of "
+ THRESHOLD * 100 + "%");
result = false;
} else {
ReportManager.info(this, dbre.getConnection(), goN
+ " protein_coding genes of a total of " + geneN
+ " for species " + speciesId
+ " have at least one GO term");
}
}
return result;
}
/*
* (non-Javadoc)
*
* @see org.ensembl.healthcheck.testcase.eg_core.AbstractEgCoreTestCase#
* getEgDescription()
*/
@Override
protected String getEgDescription() {
return "Test to check that at least " + THRESHOLD * 100
+ "% of protein coding genes have at least one GO term";
}
}