/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ensembl.healthcheck.testcase.generic;
import java.sql.Connection;
import java.util.ArrayList;
import java.util.List;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.DatabaseType;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.Species;
import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase;
import org.ensembl.healthcheck.util.DBUtils;
import org.ensembl.healthcheck.util.SqlTemplate;
/**
 * Check that the expected gene and transcript attributes have been loaded:
 * gencode_basic, transcript support level (tsl%), APPRIS (appris%) and
 * refseq_compare.
 */
public class AttribValues extends SingleDatabaseTestCase {

    /**
     * Minimum fraction of the features on a chromosome that must carry an
     * attribute; anything below this is reported as a problem.
     */
    private static final double MIN_COVERAGE = 0.95;

    /** Selects the names of all seq_regions that have a karyotype_rank attribute. */
    private static final String CHROMOSOME_SQL = "SELECT DISTINCT s.name FROM seq_region s, seq_region_attrib sa, attrib_type at WHERE s.seq_region_id = sa.seq_region_id AND sa.attrib_type_id = at.attrib_type_id AND code = 'karyotype_rank'";

    /** Prefix of the query counting transcript attributes whose code matches a LIKE pattern. */
    private static final String HAS_TRANSCRIPT_ATTRIB_SQL = "SELECT count(*) FROM transcript t, transcript_attrib ta, attrib_type a WHERE t.transcript_id = ta.transcript_id AND ta.attrib_type_id = a.attrib_type_id AND code like ";

    /**
     * Create a new testcase.
     */
    public AttribValues() {
        setDescription("Check that some attributes have been added (currently, tsl, appris, gencode and refseq attributes)");
        setTeamResponsible(Team.GENEBUILD);
    }

    /**
     * Applies to core and pre-site databases.
     */
    public void types() {
        List<DatabaseType> types = new ArrayList<DatabaseType>();
        types.add(DatabaseType.CORE);
        types.add(DatabaseType.PRE_SITE);
        setAppliesToTypes(types);
    }

    /**
     * Run the test. Species other than human, mouse, rat, pig and zebrafish
     * are skipped and pass trivially. The gencode_basic and TSL checks are run
     * for human and mouse only; the refseq_compare and APPRIS checks are run
     * for every supported species.
     *
     * @param dbre
     *          The database to use.
     * @return true if the test passed.
     *
     */
    public boolean run(DatabaseRegistryEntry dbre) {
        Species species = dbre.getSpecies();
        // Gencode and TSL attributes are only for human and mouse
        boolean humanOrMouse = species == Species.HOMO_SAPIENS || species == Species.MUS_MUSCULUS;
        boolean otherSupported = species == Species.RATTUS_NORVEGICUS || species == Species.SUS_SCROFA
                || species == Species.DANIO_RERIO;
        if (!humanOrMouse && !otherSupported) {
            return true;
        }

        Connection con = dbre.getConnection();
        boolean result = true;
        if (humanOrMouse) {
            result &= gencodeAttrib(con);
            result &= tslAttrib(dbre);
        }
        result &= refseqAttrib(con);
        result &= apprisAttrib(dbre);
        return result;
    }

    /**
     * Check that every gene with at least one non-LRG transcript has at least
     * one such transcript tagged with the gencode_basic attribute.
     *
     * @param con
     *          Connection to the database being tested.
     * @return true if no gene is missing the attribute.
     */
    protected boolean gencodeAttrib(Connection con) {
        // Both counts apply the same LRG_gene exclusion so that the tagged genes
        // are always a subset of the total and the difference is meaningful.
        int gencodeGenes = DBUtils.getRowCount(con, "SELECT COUNT(distinct gene_id) FROM transcript t, attrib_type at, transcript_attrib ta WHERE t.transcript_id = ta.transcript_id AND at.attrib_type_id=ta.attrib_type_id AND at.code='gencode_basic' AND t.biotype NOT IN ('LRG_gene')");
        int genes = DBUtils.getRowCount(con, "SELECT COUNT(distinct gene_id) FROM transcript WHERE biotype NOT IN ('LRG_gene')");

        if (genes > gencodeGenes) {
            ReportManager.problem(this, con, (genes - gencodeGenes) + " genes do not have any transcripts with the gencode_basic attribute\n");
            return false;
        }
        ReportManager.correct(this, con, gencodeGenes + " genes found with a gencode_basic transcript attribute");
        return true;
    }

    /**
     * Check that every gene on a default-version chromosome (excluding LRG and
     * MT regions and regions listed as PATCH_NOVEL, PATCH_FIX or HAP assembly
     * exceptions) has a refseq_compare gene attribute.
     *
     * @param con
     *          Connection to the database being tested.
     * @return true if no gene is missing the attribute.
     */
    protected boolean refseqAttrib(Connection con) {
        // Restriction shared by both counts so they always describe the same set of genes.
        String primaryGenes = "g.seq_region_id = s.seq_region_id AND " +
                "s.coord_system_id = cs.coord_system_id AND cs.name = 'chromosome' AND cs.attrib = 'default_version' AND s.name NOT LIKE 'LRG%' " +
                "AND s.name != 'MT' AND s.seq_region_id NOT IN (SELECT seq_region_id FROM assembly_exception WHERE exc_type in ('PATCH_NOVEL', 'PATCH_FIX', 'HAP'))";

        int genes = DBUtils.getRowCount(con,
                "SELECT COUNT(distinct g.gene_id) FROM gene g, seq_region s, coord_system cs WHERE " + primaryGenes);
        int refseqGenes = DBUtils.getRowCount(con,
                "SELECT COUNT(distinct g.gene_id) FROM gene g, seq_region s, coord_system cs, gene_attrib ga, attrib_type at WHERE " + primaryGenes +
                " AND ga.gene_id = g.gene_id AND ga.attrib_type_id = at.attrib_type_id AND code = 'refseq_compare'");

        if (genes > refseqGenes) {
            ReportManager.problem(this, con, (genes - refseqGenes) + " genes do not have the refseq_compare attribute");
            return false;
        }
        ReportManager.correct(this, con, refseqGenes + " genes found with refseq_compare attribute");
        return true;
    }

    /**
     * Check that, on every karyotype chromosome, at least 95% of the
     * protein_coding genes have a transcript with an appris% attribute. Fails
     * immediately if the database holds no appris% attributes at all.
     *
     * @param dbre
     *          The database to use.
     * @return true if every chromosome meets the threshold.
     */
    protected boolean apprisAttrib(DatabaseRegistryEntry dbre) {
        Connection con = dbre.getConnection();
        String codingSql = "SELECT count(distinct g.stable_id) FROM gene g, seq_region s WHERE g.seq_region_id = s.seq_region_id AND s.name = ? AND biotype = 'protein_coding'";
        String apprisSql = "SELECT count(distinct g.stable_id) FROM gene g, seq_region s, transcript t, transcript_attrib ta, attrib_type a WHERE g.seq_region_id = s.seq_region_id AND s.name = ? AND g.biotype = 'protein_coding' AND g.gene_id=t.gene_id AND t.transcript_id=ta.transcript_id AND ta.attrib_type_id=a.attrib_type_id AND a.code like 'appris%'";

        // If no data available at all, exit early
        if (!hasTranscriptAttribLike(con, "appris%")) {
            ReportManager.problem(this, con, "No appris attributes found, have you imported the new data?");
            return false;
        }

        boolean result = checkCoverageByChromosome(dbre, codingSql, apprisSql, "protein coding genes", "appris%");
        if (result) {
            ReportManager.correct(this, con, "Found correct number of Appris attributes on all chromosomes");
        }
        return result;
    }

    /**
     * Check that, on every karyotype chromosome, at least 95% of the
     * transcripts have a tsl% attribute. Fails immediately if the database
     * holds no tsl% attributes at all. Also reports, for information only, how
     * many transcripts on assembly_exception regions carry a tsl% attribute.
     *
     * @param dbre
     *          The database to use.
     * @return true if every chromosome meets the threshold.
     */
    protected boolean tslAttrib(DatabaseRegistryEntry dbre) {
        Connection con = dbre.getConnection();
        String transcriptSql = "SELECT count(distinct t.stable_id) from seq_region s, transcript t WHERE t.seq_region_id = s.seq_region_id and s.name = ?";
        String tslSql = "SELECT count(distinct t.stable_id) from seq_region s, transcript t, transcript_attrib ta, attrib_type a WHERE t.seq_region_id = s.seq_region_id AND t.transcript_id = ta.transcript_id AND ta.attrib_type_id = a.attrib_type_id AND a.code like 'tsl%' AND s.name = ?";

        // If no data available, exit early
        if (!hasTranscriptAttribLike(con, "tsl%")) {
            ReportManager.problem(this, con, "No tsl attributes found, have you imported the new data?");
            return false;
        }

        // Count each transcript once: a plain count(*) over this join would count
        // one row per (assembly_exception row, tsl attribute) combination.
        // NOTE(review): the join matches every assembly_exception row regardless of
        // exc_type, so regions other than patches may be included - confirm intent.
        String patchSql = "SELECT count(DISTINCT t.transcript_id) FROM transcript t, assembly_exception ax, transcript_attrib ta, attrib_type a " +
                "WHERE t.seq_region_id = ax.seq_region_id AND t.transcript_id = ta.transcript_id AND ta.attrib_type_id = a.attrib_type_id AND code like 'tsl%'";
        int patchCount = DBUtils.getRowCount(con, patchSql);
        ReportManager.info(this, con, "There are " + patchCount + " transcripts with TSL attributes on patches");

        boolean result = checkCoverageByChromosome(dbre, transcriptSql, tslSql, "transcripts", "tsl%");
        if (result) {
            ReportManager.correct(this, con, "Found correct number of TSL attributes on all chromosomes");
        }
        return result;
    }

    /**
     * Tell whether any transcript attribute exists whose code matches the given
     * SQL LIKE pattern.
     *
     * @param con
     *          Connection to the database being tested.
     * @param codePattern
     *          LIKE pattern for attrib_type.code; must be a trusted constant as
     *          it is embedded directly in the SQL.
     * @return true if at least one matching transcript attribute row exists.
     */
    private boolean hasTranscriptAttribLike(Connection con, String codePattern) {
        return DBUtils.getRowCount(con, HAS_TRANSCRIPT_ATTRIB_SQL + "'" + codePattern + "'") != 0;
    }

    /**
     * For every karyotype chromosome, compare the number of features carrying
     * an attribute with the total number of features and report a problem when
     * fewer than {@link #MIN_COVERAGE} of them are covered.
     *
     * @param dbre
     *          The database to use.
     * @param totalSql
     *          Count query for all features; takes the chromosome name as its
     *          single parameter.
     * @param withAttribSql
     *          Count query for features carrying the attribute; takes the
     *          chromosome name as its single parameter.
     * @param featureLabel
     *          Plural feature description used in the problem message.
     * @param codePattern
     *          Attribute code pattern quoted in the problem message.
     * @return true if every chromosome meets the threshold.
     */
    private boolean checkCoverageByChromosome(DatabaseRegistryEntry dbre, String totalSql, String withAttribSql,
            String featureLabel, String codePattern) {
        Connection con = dbre.getConnection();
        SqlTemplate t = DBUtils.getSqlTemplate(dbre);
        boolean result = true;

        List<String> chromosomes = t.queryForDefaultObjectList(CHROMOSOME_SQL, String.class);
        for (String chromosome : chromosomes) {
            int total = t.queryForDefaultObject(totalSql, Integer.class, chromosome);
            int withAttrib = t.queryForDefaultObject(withAttribSql, Integer.class, chromosome);
            if (withAttrib < total * MIN_COVERAGE) {
                ReportManager.problem(this, con, chromosome + " has " + total + " " + featureLabel + " but only "
                        + withAttrib + " have a transcript-attrib like '" + codePattern + "'");
                result = false;
            }
        }
        return result;
    }

    // ----------------------------------------------------------------------

} // AttribValues