/* * Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute * Copyright [2016-2017] EMBL-European Bioinformatics Institute * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * Copyright (C) 2003 EBI, GRL * * This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.ensembl.healthcheck.testcase.generic; import java.util.HashSet; import java.util.List; import java.util.ArrayList; import java.util.ListIterator; import java.util.Set; import org.ensembl.healthcheck.DatabaseRegistryEntry; import org.ensembl.healthcheck.DatabaseType; import org.ensembl.healthcheck.ReportManager; import org.ensembl.healthcheck.Team; import org.ensembl.healthcheck.testcase.Priority; import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase; import org.ensembl.healthcheck.util.DBUtils; import org.ensembl.healthcheck.util.SqlTemplate; /** * Check that the gene and transcript biotypes are consistent */ public class BiotypeGroups extends SingleDatabaseTestCase { /** * Constructor. */ public BiotypeGroups() { setDescription("Check that the gene and transcript biotypes are consistent."); setPriority(Priority.AMBER); setEffect("Unknown/incorrect biotypes."); setTeamResponsible(Team.GENEBUILD); } /** * This test Does not apply to sangervega dbs */ public void types() { removeAppliesToType(DatabaseType.SANGER_VEGA); } /** * Run the test. * * @param dbre * The database to use. * @return true if the test passed. * */ public boolean run(DatabaseRegistryEntry dbre) { boolean result = true; if (dbre.getType() == DatabaseType.CORE) { result &= checkGrouping(dbre); } return result; } private <T extends CharSequence> boolean checkGrouping(DatabaseRegistryEntry dbre) { String databaseType = dbre.getType().getName(); String[] table = {"gene"}; Set<String> geneBiotypes = getBiotypesDb(dbre, table); List<String> annotatorGenes = new ArrayList<String>(); if("vega".equals(databaseType) || "sangervega".equals(databaseType)) { annotatorGenes = getGenesWithAnnotatorBiotype(dbre); } ArrayList<String> transcriptErrors = new ArrayList<String>(); ArrayList<String> biotypeGroupErrors = new ArrayList<String>(); ArrayList<String> noGroupErrors = new ArrayList<String>(); ArrayList<String> nonCodingErrors = new ArrayList<String>(); ArrayList<String> pseudogeneErrors = new ArrayList<String>(); for (String geneBiotype : geneBiotypes) { Set<String> transcriptBiotypes = getBiotypesTranscript(dbre, geneBiotype); String[] transcripts = transcriptBiotypes.toArray(new String[0]); String[] genes = new String[] {geneBiotype}; Set<String> geneGrouping = getGrouping(dbre, genes, "gene", databaseType); Set<String> transcriptGrouping = getGrouping(dbre, transcripts, "transcript", databaseType); if (transcriptBiotypes.size() == 1 && !transcriptBiotypes.contains(geneBiotype) ) { transcriptErrors.add("Transcript biotype '" + transcriptBiotypes + "' does not match gene biotype '" + geneBiotype + "'"); } else if (transcriptGrouping.size() == 1 && !geneGrouping.equals(transcriptGrouping)) { biotypeGroupErrors.add("Genes of biotype '" + geneBiotype + "' should not have transcripts of mismatched group '" + transcriptGrouping + "'"); } else if (geneGrouping.contains("undefined") || geneGrouping.contains("non-coding")) { noGroupErrors.add("Genes of biotype '" + geneBiotype + "' should not have transcripts with biotypes in '" + transcriptBiotypes + "'"); } else if (geneGrouping.contains("pseudogene")) { if (transcriptGrouping.contains("coding") || transcriptGrouping.contains("undefined")) { nonCodingErrors.add("Some genes of biotype '" + geneBiotype + "' have transcripts in '" + transcriptBiotypes + "'"); } List<String> allGenes = getGene(dbre, geneBiotype, databaseType); List<String> goodGenes = getGeneWithTranscript(dbre, geneGrouping, databaseType); goodGenes.addAll(annotatorGenes); pseudogeneErrors.addAll( checkMissing(dbre, allGenes, goodGenes, geneBiotype) ); } else if (geneGrouping.contains("coding")) { List<String> allGenes = getGene(dbre, geneBiotype, databaseType); List<String> goodGenes = getGeneWithTranscript(dbre, geneGrouping, databaseType); goodGenes.addAll(annotatorGenes); pseudogeneErrors.addAll( checkMissing(dbre, allGenes, goodGenes, geneBiotype) ); if (geneBiotype.contains("polymorphic_pseudogene")) { allGenes = getGeneP(dbre, "polymorphic_pseudogene", databaseType); goodGenes = getGeneWithTranscriptP(dbre, "polymorphic_pseudogene", databaseType); goodGenes.addAll(annotatorGenes); pseudogeneErrors.addAll( checkMissing(dbre, allGenes, goodGenes, geneBiotype) ); } } } if ( processErrors(dbre, transcriptErrors) && processErrors(dbre,biotypeGroupErrors) && processErrors(dbre, noGroupErrors) && processErrors(dbre,nonCodingErrors) && processErrors(dbre,pseudogeneErrors)) { return true; } else { return false; } } private boolean processErrors(DatabaseRegistryEntry dbre, List<String> errorList) { ListIterator<String> errorIt = errorList.listIterator(); int i = 0; Boolean result = true; while (errorIt.hasNext()) { result = false; ReportManager.problem(this,dbre.getConnection(),errorIt.next()); i++; if (i == 10) { ReportManager.problem(this,dbre.getConnection(), errorList.size() + " similar errors found in total."); break; } } return result; } private ArrayList<String> checkMissing(DatabaseRegistryEntry dbre, List<String> allGenes, List<String> goodGenes, String biotype) { Set<String> missing = new HashSet<String>(allGenes); ArrayList<String> unhappyGenes = new ArrayList<String>(); missing.removeAll(goodGenes); for(CharSequence name: missing) { unhappyGenes.add(String.format("Gene '%s' of biotype '%s' has no transcript of same biotype group", name, biotype)); } return unhappyGenes; } private Set<String> getBiotypesDb(DatabaseRegistryEntry dbre, String[] tables) { SqlTemplate t = DBUtils.getSqlTemplate(dbre); Set<String> results = new HashSet<String>(); for (String table : tables) { String sql = "SELECT DISTINCT(biotype) FROM " + table; results.addAll(t.queryForDefaultObjectList(sql, String.class)); } return results; } private Set<String> getGrouping(DatabaseRegistryEntry dbre, String[] biotypes, String table, String databaseType) { SqlTemplate t = DBUtils.getSqlTemplate(getProductionDatabase()); Set<String> results = new HashSet<String>(); for (String biotype : biotypes) { String sql = "SELECT biotype_group FROM biotype WHERE object_type='" + table + "' AND is_current = 1 AND name ='" + biotype + "' AND FIND_IN_SET('" + databaseType + "', db_type) > 0"; results.addAll(t.queryForDefaultObjectList(sql, String.class)); } return results; } private Set<String> getBiotypeFromGrouping(DatabaseRegistryEntry dbre, Set<String> biotypeGroup, String table, String databaseType) { SqlTemplate t = DBUtils.getSqlTemplate(getProductionDatabase()); Set<String> results = new HashSet<String>(); for (String group : biotypeGroup) { String sql = "SELECT name FROM biotype WHERE object_type='" + table + "' AND is_current = 1 AND biotype_group ='" + group + "' AND FIND_IN_SET('" + databaseType + "', db_type) > 0"; results.addAll(t.queryForDefaultObjectList(sql, String.class)); } return results; } private Set<String> getBiotypesTranscript(DatabaseRegistryEntry dbre, String geneBiotype) { SqlTemplate t = DBUtils.getSqlTemplate(dbre); String sql = "SELECT DISTINCT(t.biotype) FROM transcript t, gene g WHERE g.biotype= '" + geneBiotype + "' AND g.gene_id = t.gene_id "; List<String> results = t.queryForDefaultObjectList(sql, String.class); return new HashSet<String>(results); } // private boolean checkHasTranscriptBiotype(DatabaseRegistryEntry dbre, Set<String> biotypeGroup, String gene) { // SqlTemplate t = DBUtils.getSqlTemplate(dbre); // String databaseType = dbre.getType().getName(); // boolean result = false; // Set<String> biotypes = getBiotypeFromGrouping(dbre, biotypeGroup, "transcript", databaseType); // String list = getListBiotypes(biotypes); // int rows = DBUtils.getRowCount(dbre.getConnection(), "SELECT COUNT(*) FROM transcript t, gene g where g.gene_id = t.gene_id and g.stable_id = '" + gene + "' and t.biotype in (" + list + ")"); // if (rows > 0){ // result = true; // } // return result; // } private List<String> getGene(DatabaseRegistryEntry dbre, String biotype, String databaseType) { SqlTemplate t = DBUtils.getSqlTemplate(dbre); String sql = "SELECT stable_id FROM gene where biotype = '" + biotype + "'"; return t.queryForDefaultObjectList(sql, String.class); } private List<String> getGeneP(DatabaseRegistryEntry dbre, String biotype, String databaseType) { SqlTemplate t = DBUtils.getSqlTemplate(dbre); String sql = "SELECT stable_id FROM gene where biotype = '" + biotype + "'"; return t.queryForDefaultObjectList(sql, String.class); } private List<String> getGeneWithTranscriptP(DatabaseRegistryEntry dbre, String biotype, String databaseType) { SqlTemplate t = DBUtils.getSqlTemplate(dbre); String sql = "SELECT g.stable_id from gene g, transcript t where g.gene_id = t.gene_id and t.biotype = '" + biotype + "' group by g.stable_id"; return t.queryForDefaultObjectList(sql, String.class); } private List<String> getGenesWithAnnotatorBiotype(DatabaseRegistryEntry dbre) { SqlTemplate t = DBUtils.getSqlTemplate(dbre); String sql = "select g.stable_id from gene g join gene_attrib using (gene_id) join attrib_type at using (attrib_type_id) where at.code = 'hidden_remark' and value like 'ASB_%' and CONCAT('ASB_',g.biotype) = value"; return t.queryForDefaultObjectList(sql,String.class); } private List<String> getGeneWithTranscript(DatabaseRegistryEntry dbre, Set<String> biotypeGroup, String databaseType) { SqlTemplate t = DBUtils.getSqlTemplate(dbre); Set<String> biotypesT = getBiotypeFromGrouping(dbre, biotypeGroup, "transcript", databaseType); Set<String> biotypesG = getBiotypeFromGrouping(dbre, biotypeGroup, "gene", databaseType); String listT = getListBiotypes(biotypesT); String listG = getListBiotypes(biotypesG); String sql = "SELECT g.stable_id from gene g, transcript t where g.gene_id = t.gene_id and g.biotype in (" + listG + ") and t.biotype in (" + listT + ") group by g.stable_id"; return t.queryForDefaultObjectList(sql, String.class); } private String getListBiotypes(Set<String> biotypes) { StringBuilder list = new StringBuilder(); for (String type : biotypes) { list.append("'"); list.append(type); list.append("',"); } list.append("''"); return list.toString(); } }