/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ensembl.healthcheck.testcase.generic;
import java.sql.Connection;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.DatabaseType;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase;
import org.ensembl.healthcheck.util.DBUtils;
/**
 * Check if protein_coding genes have a canonical transcript that has a valid translation. See also canonical_transcript checks in
 * CoreForeignKeys.
 *
 * <p>
 * The test first verifies that every canonical transcript referenced by a gene belongs to that gene, and then, for
 * databases of type CORE or SANGER_VEGA, runs a series of count queries relating gene biotypes, transcript biotypes,
 * canonical transcripts and translations. Each count query that returns more than zero rows is reported as a problem.
 *
 * <p>
 * NOTE(review): {@link #types()} removes SANGER_VEGA from the applicable types, so the Sanger Vega specific branches
 * below may never be reached - confirm how applicable types are resolved before removing them.
 */
public class CanonicalTranscriptCoding extends SingleDatabaseTestCase {

    /** Restriction appended to queries on Sanger Vega databases: only genes with source havana or WU are considered. */
    private static final String SANGER_VEGA_SOURCE_FILTER = " and (g.source='havana' or g.source='WU')";

    /**
     * FROM/WHERE clause selecting genes whose canonical transcript is attached to a different gene. Shared by the
     * count query and by the diagnostic query suggested in the problem report so the two cannot drift apart.
     */
    private static final String FOREIGN_CANONICAL_CLAUSE = " FROM gene g, transcript t where g.canonical_transcript_id=t.transcript_id and g.gene_id <> t.gene_id";

    /**
     * Create a new instance of CanonicalTranscriptCoding.
     */
    public CanonicalTranscriptCoding() {
        setDescription("Check if protein_coding genes have a canonical transcript that has a valid translation. Also check than number of canonical transcripts is correct. See also canonical_transcript checks in CoreForeignKeys.");
        setTeamResponsible(Team.CORE);
        setSecondTeamResponsible(Team.GENEBUILD);
    }

    /**
     * Declare the database types this test does not apply to: SANGER_VEGA, VEGA and OTHERFEATURES.
     */
    public void types() {
        removeAppliesToType(DatabaseType.SANGER_VEGA);
        removeAppliesToType(DatabaseType.VEGA);
        removeAppliesToType(DatabaseType.OTHERFEATURES);
    }

    /**
     * Run the test.
     *
     * @param dbre
     *          The database to use.
     * @return true if the test passed.
     *
     */
    public boolean run(DatabaseRegistryEntry dbre) {
        boolean result = true;
        Connection con = dbre.getConnection();

        // --------------------------------
        // Check all canonical transcripts in a gene correspond to a transcript and all canonical_transcript_ids correspond to a gene
        result &= checkForOrphans(con, "gene", "canonical_transcript_id", "transcript", "transcript_id", true);

        // --------------------------------
        // The canonical transcript of a gene must be one of that gene's own transcripts
        int rows = DBUtils.getRowCount(con, "SELECT COUNT(*)" + FOREIGN_CANONICAL_CLAUSE);
        if (rows > 0) {
            String usefulSql = "SELECT g.gene_id,g.canonical_transcript_id" + FOREIGN_CANONICAL_CLAUSE;
            ReportManager.problem(this, con, rows + " rows in gene have a canonical transcript that doesn't belong to the gene. Try '" + usefulSql + "' to find the offending genes");
            result = false;
        }

        // The biotype checks are only run for core-like databases
        if (dbre.getType() == DatabaseType.SANGER_VEGA || dbre.getType() == DatabaseType.CORE) {
            result &= checkBiotypes(dbre);
        }
        return result;
    }

    /**
     * Run the biotype/translation consistency checks. Every check is always executed (the results are combined with
     * the non-short-circuiting {@code &=}), so a single run reports all problems rather than stopping at the first.
     *
     * @param dbre
     *          The database to use.
     * @return true if none of the checks reported a problem.
     */
    private boolean checkBiotypes(DatabaseRegistryEntry dbre) {
        boolean result = true;
        Connection con = dbre.getConnection();
        // for sangervega ignore genes that do not have source havana or WU
        boolean sangerVega = dbre.getType() == DatabaseType.SANGER_VEGA;

        // --------------------------------
        // A gene that has at least one transcript.biotype='protein_coding' should have gene.biotype='protein_coding'
        String sql = "SELECT COUNT(*) FROM gene g WHERE g.gene_id IN (SELECT tr.gene_id FROM transcript tr WHERE tr.biotype='protein_coding') AND g.biotype NOT IN ('protein_coding', 'polymorphic_pseudogene')";
        String problem = " genes with at least one protein_coding transcript do not have biotype protein_coding";
        if (sangerVega) {
            // leading space keeps the appended clause separated from the closing parenthesis above
            sql += " AND g.biotype!='polymorphic' AND g.biotype!='polymorphic_pseudogene'" + SANGER_VEGA_SOURCE_FILTER;
            problem += " or polymorphic_pseudogene";
        }
        result &= reportCount(con, sql, problem, "All genes with protein_coding transcripts have protein_coding biotype");

        // --------------------------------
        // Protein_coding transcripts should all have translations
        sql = forSangerVega("SELECT count(*) FROM transcript tr join gene g on tr.gene_id=g.gene_id WHERE tr.biotype='protein_coding' AND tr.transcript_id NOT IN (SELECT transcript_id from translation)", sangerVega);
        result &= reportCount(con, sql,
                " protein_coding transcripts do not have translations\nUSEFUL SQL: SELECT transcript.transcript_id,transcript.analysis_id FROM transcript LEFT JOIN translation ON transcript.transcript_id = translation.transcript_id WHERE transcript.biotype = 'protein_coding' and translation.transcript_id IS NULL; ",
                "All protein_coding transcripts have translations");

        // --------------------------------
        // All genes should have a canonical transcript
        sql = forSangerVega("SELECT COUNT(*) FROM gene g WHERE g.canonical_transcript_id is NULL", sangerVega);
        result &= reportCount(con, sql, " genes do not have a canonical transcript", "All genes have a canonical transcript");

        // --------------------------------
        // All canonical transcripts with a translation should belong to a gene whose biotype is in the allowed list
        // below; Sanger Vega databases additionally allow 'polymorphic', 'IG_gene' and 'TR_gene'.
        sql = "SELECT COUNT(*) FROM gene g WHERE g.canonical_transcript_id IN (SELECT tr.transcript_id FROM transcript tr, translation tl WHERE tr.transcript_id=tl.transcript_id) AND g.biotype NOT IN ('protein_coding','IG_C_gene','IG_D_gene','IG_J_gene','IG_V_gene','IG_LV_gene','RNA-Seq_gene','polymorphic_pseudogene','TR_C_gene','TR_J_gene','TR_V_gene','TR_D_gene','LRG_gene','nontranslating_cds'";
        if (sangerVega) {
            sql += ", 'polymorphic','IG_gene','TR_gene')" + SANGER_VEGA_SOURCE_FILTER;
        } else {
            sql += ")";
        }
        result &= reportCount(con, sql, " genes with canonical transcripts have the wrong biotype", "All genes with canonical transcripts have the correct biotype");

        // --------------------------------
        // None of the transcripts that have a translation and a biotype outside the allowed list below should be
        // canonical transcripts to any gene; Sanger Vega databases additionally allow 'polymorphic', 'IG_gene' and 'TR_gene'.
        sql = "SELECT COUNT(distinct g.gene_id) FROM gene g WHERE g.canonical_transcript_id IN (select tr.transcript_id FROM transcript tr, translation tl WHERE tr.transcript_id=tl.transcript_id AND tr.biotype NOT IN ('protein_coding','IG_C_gene','IG_D_gene','IG_J_gene','IG_V_gene', 'IG_LV_gene', 'TR_C_gene','TR_J_gene','TR_V_gene', 'TR_D_gene','LRG_gene', 'nonsense_mediated_decay','polymorphic_pseudogene','non_stop_decay','nontranslating_cds'";
        if (sangerVega) {
            sql += ", 'polymorphic','IG_gene','TR_gene'))" + SANGER_VEGA_SOURCE_FILTER;
        } else {
            sql += "))";
        }
        result &= reportCount(con, sql, " genes have canonical transcripts with mismatched biotypes", "All genes have canonical transcripts with matching biotypes");

        // --------------------------------
        // A gene that has gene.biotype='protein_coding' and has at least one transcript.biotype='protein_coding' should have a
        // canonical transcript.biotype='protein_coding'.
        sql = forSangerVega("SELECT count(distinct g.gene_id) FROM gene g JOIN transcript t USING (gene_id) WHERE g.gene_id IN (SELECT g.gene_id FROM gene g JOIN transcript t ON (g.canonical_transcript_id = t.transcript_id) WHERE g.biotype = 'protein_coding' AND t.biotype not in ('protein_coding', 'nonsense_mediated_decay')) AND t.biotype = 'protein_coding'", sangerVega);
        result &= reportCount(con, sql,
                " genes with at least one protein_coding transcript do not have a protein_coding canonical transcript",
                "All genes with at least one protein_coding transcript have a protein_coding canonical transcript");

        // --------------------------------
        // If a gene is gene.biotype='protein_coding' but has no transcripts that are transcript.biotype='protein_coding', at least one
        // of the transcripts has to have a translation.
        // NOTE(review): the inner JOIN on translation means this query counts genes that DO have a transcript with a
        // canonical translation, while a non-zero count is reported as a problem. That looks inverted relative to the
        // stated intent - confirm the intended semantics before changing the query.
        sql = forSangerVega("SELECT count(distinct g.gene_id) FROM gene g JOIN transcript t USING (gene_id) JOIN translation p ON (t.canonical_translation_id = p.translation_id) WHERE g.biotype = 'protein_coding' AND g.gene_id NOT IN (SELECT gene_id FROM transcript WHERE biotype in ('protein_coding', 'nonsense_mediated_decay', 'non_stop_decay'))", sangerVega);
        result &= reportCount(con, sql,
                " protein_coding gene(s) may potentially be missing translations.",
                "All protein_coding genes with no protein_coding transcripts have at least one transcripts which translates");

        // --------------------------------
        // check if protein_coding genes have a canonical transcript that has a valid translation
        sql = forSangerVega("SELECT COUNT(*) FROM gene g LEFT JOIN translation tr ON g.canonical_transcript_id=tr.transcript_id WHERE g.biotype='protein_coding' AND tr.transcript_id IS NULL", sangerVega);
        result &= reportCount(con, sql,
                " protein_coding genes have canonical transcripts that do not have valid translations",
                "All protein_coding genes have canonical_transcripts that translate");

        // --------------------------------
        // check that the number of canonical translations is correct: the count of translations referenced as
        // canonical by some transcript must equal the count of translations attached to a transcript
        int numCanonical = DBUtils.getRowCount(con, "SELECT COUNT(*) FROM transcript t1, translation p, transcript t2 WHERE t1.canonical_translation_id = p.translation_id AND p.transcript_id = t2.transcript_id");
        int numTotal = DBUtils.getRowCount(con, "SELECT COUNT(*) FROM translation p, transcript t WHERE t.transcript_id = p.transcript_id");
        if (numCanonical != numTotal) {
            result = false;
            ReportManager.problem(this, con, "Number of canonical translations (" + numCanonical + ") is different from the total number of translations (" + numTotal + ")");
        } else {
            ReportManager.correct(this, con, "Number of canonical translations is correct.");
        }
        return result;
    }

    /**
     * Append the Sanger Vega source restriction to a query when required.
     *
     * @param sql
     *          A query whose gene table is aliased as {@code g} and that already has a WHERE clause.
     * @param sangerVega
     *          Whether the database being tested is of type SANGER_VEGA.
     * @return the query, restricted to genes with source havana or WU if {@code sangerVega} is true, otherwise unchanged.
     */
    private static String forSangerVega(String sql, boolean sangerVega) {
        return sangerVega ? sql + SANGER_VEGA_SOURCE_FILTER : sql;
    }

    /**
     * Run a count query and report the outcome: a count greater than zero is reported as a problem, anything else
     * as correct.
     *
     * @param con
     *          The connection to run the query on.
     * @param sql
     *          The count query to run.
     * @param problemSuffix
     *          Text appended to the row count to form the problem message.
     * @param correctMessage
     *          Message reported when the count is not greater than zero.
     * @return true if the count was not greater than zero.
     */
    private boolean reportCount(Connection con, String sql, String problemSuffix, String correctMessage) {
        int rows = DBUtils.getRowCount(con, sql);
        if (rows > 0) {
            ReportManager.problem(this, con, rows + problemSuffix);
            return false;
        }
        ReportManager.correct(this, con, correctMessage);
        return true;
    }
} // CanonicalTranscriptCoding