/*
 * Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
 * Copyright [2016-2017] EMBL-European Bioinformatics Institute
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.ensembl.healthcheck.testcase.generic;

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.List;
import java.util.Scanner;
import java.util.Set;

import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.DatabaseType;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase;
import org.ensembl.healthcheck.util.DBUtils;
import org.ensembl.healthcheck.util.RowMapper;
import org.ensembl.healthcheck.util.SqlTemplate;

/**
 * Check that certain seq_regions that have known, protein_coding genes
 * have the coding_cnt attribute associated with them. Also ensure
 * that _rna_edit attributes represent substitutions
 */
public class SeqRegionAttribsPresent extends SingleDatabaseTestCase {

    /**
     * Create a new SeqRegionAttribsPresent healthcheck.
     */
    public SeqRegionAttribsPresent() {
        setDescription("Check that certain seq_regions that have protein_coding genes have the coding_cnt attribute associated with them. "
                + "Also ensure that _rna_edit attributes represent substitutions");
        setEffect("Website gene counts will be wrong and API will fail to load");
        setFix("Re-run density generation pipeline or edit the database to remove the offending attribute");
        setTeamResponsible(Team.RELEASE_COORDINATOR);
    }

    /**
     * This only really applies to core databases
     */
    public void types() {
        removeAppliesToType(DatabaseType.OTHERFEATURES);
        removeAppliesToType(DatabaseType.ESTGENE);
        removeAppliesToType(DatabaseType.VEGA);
        removeAppliesToType(DatabaseType.CDNA);
        removeAppliesToType(DatabaseType.RNASEQ);
    }

    /**
     * Run the test. Both sub-checks are always executed so that every
     * problem is reported in a single run.
     *
     * @param dbre
     *          The database to check.
     * @return true if the test passes.
     */
    public boolean run(final DatabaseRegistryEntry dbre) {
        boolean result = true;
        // '&=' (not '&&') so the second check still runs when the first fails
        result &= checkCodingCountAttributes(dbre);
        result &= checkRnaEditAttributes(dbre);
        return result;
    } // run

    /**
     * Check that every seq_region carrying a protein_coding gene (ignoring
     * regions that have an LRG or non_ref attribute) also carries the gene
     * count attribute: KnwnPCCount for SANGER_VEGA databases, coding_cnt
     * otherwise.
     *
     * @param dbre
     *          The database to check.
     * @return true if no such seq_region is missing the attribute.
     */
    private boolean checkCodingCountAttributes(final DatabaseRegistryEntry dbre) {
        boolean result = true;
        Connection con = dbre.getConnection();
        String code = (dbre.getType() == DatabaseType.SANGER_VEGA) ? "KnwnPCCount" : "coding_cnt";
        SqlTemplate t = DBUtils.getSqlTemplate(dbre);

        // seq_regions with protein_coding genes, minus those flagged LRG / non_ref
        String sql = "select distinct g.seq_region_id from gene g where g.biotype = ? "
                + "and g.seq_region_id not in ("
                + "select distinct g.seq_region_id from gene g, seq_region_attrib sa, attrib_type at "
                + "where g.seq_region_id = sa.seq_region_id and sa.attrib_type_id = at.attrib_type_id "
                + "and at.code in (?,?))";
        List<String> toplevel = t.queryForDefaultObjectList(sql, String.class, "protein_coding", "LRG", "non_ref");

        // seq_regions with genes that already carry the count attribute
        sql = "select distinct g.seq_region_id from gene g, seq_region_attrib sa, attrib_type at "
                + "where g.seq_region_id = sa.seq_region_id and sa.attrib_type_id = at.attrib_type_id "
                + "and code =? ";
        List<String> known = t.queryForDefaultObjectList(sql, String.class, code);

        Set<String> missing = new HashSet<String>(toplevel);
        missing.removeAll(known);

        if (missing.isEmpty()) {
            ReportManager.correct(this, con, "All seq_regions with protein_coding genes have a coding_cnt attribute associated with them");
        } else {
            String msg = String.format("%s regions with protein_coding genes do not have the coding_cnt attribute associated", missing.size());
            ReportManager.problem(this, con, msg);
            result = false;
        }
        return result;
    }

    /**
     * Check that any _rna_edit attribute represents a substitution rather
     * than an insertion or deletion. Every offending attribute is reported
     * as a warning, followed by a single summarising problem.
     *
     * @param dbre
     *          The database to check.
     * @return true if every _rna_edit attribute passes {@link Attrib#isOk()}.
     */
    private boolean checkRnaEditAttributes(final DatabaseRegistryEntry dbre) {
        boolean ok = true;
        RowMapper<Attrib> mapper = new RowMapper<Attrib>() {
            public Attrib mapRow(ResultSet rs, int row) throws SQLException {
                return new Attrib(rs.getString(1), rs.getLong(2), rs.getString(3));
            }
        };
        String sql = "select sr.name, sr.seq_region_id, sra.value "
                + "from seq_region sr join seq_region_attrib sra using (seq_region_id) "
                + "join attrib_type at using (attrib_type_id) where at.code =?";
        List<Attrib> attributes = getSqlTemplate(dbre).queryForList(sql, mapper, "_rna_edit");
        for (Attrib a : attributes) {
            if (!a.isOk()) {
                ReportManager.warning(this, dbre.getConnection(), a.toString());
                ok = false;
            }
        }

        if (!ok) {
            ReportManager.problem(this, dbre.getConnection(),
                    "Detected sequence regions with incorrectly formatted _rna_edit attributes. Check warnings");
        }

        return ok;
    }

    /**
     * Only to be used in this class.
     *
     * Holds one _rna_edit attribute value, read as whitespace-separated
     * tokens: an integer start, an integer end and an edit string. Parsing
     * is tolerant: a missing or non-numeric token leaves the corresponding
     * field null instead of throwing, so that malformed values are reported
     * by the healthcheck rather than aborting it.
     */
    private static class Attrib {

        private final String seqRegionName;
        private final Long seqRegionId;
        private final String attribute;
        /** Parsed start coordinate, or null if it could not be read. */
        private final Integer editStart;
        /** Parsed end coordinate, or null if it could not be read. */
        private final Integer editEnd;
        /** Parsed edit string, or null if the value has no third token. */
        private final String editString;

        /**
         * @param seqRegionName
         *          name of the seq_region owning the attribute
         * @param seqRegionId
         *          identifier of the seq_region owning the attribute
         * @param attribute
         *          raw attribute value; may be null or malformed
         */
        public Attrib(final String seqRegionName, final Long seqRegionId, final String attribute) {
            this.seqRegionName = seqRegionName;
            this.seqRegionId = seqRegionId;
            this.attribute = attribute;

            Integer start = null;
            Integer end = null;
            String edit = null;
            if (attribute != null) {
                Scanner sc = new Scanner(attribute).useDelimiter("\\s+");
                try {
                    // Only advance while the next token has the expected shape;
                    // nextInt()/next() would otherwise throw on bad input.
                    if (sc.hasNextInt()) {
                        start = sc.nextInt();
                        if (sc.hasNextInt()) {
                            end = sc.nextInt();
                            if (sc.hasNext()) {
                                edit = sc.next();
                            }
                        }
                    }
                } finally {
                    sc.close();
                }
            }
            this.editStart = start;
            this.editEnd = end;
            this.editString = edit;
        }

        /**
         * @return true if both coordinates were parsed from the raw value.
         */
        private boolean isParsed() {
            return editStart != null && editEnd != null;
        }

        /**
         * @return true only if the value parsed completely (start, end and
         *         edit string) and the edit string is exactly as long as the
         *         start..end range.
         */
        public boolean isOk() {
            return isParsed() && editString != null && editLength() == editStringLength();
        }

        /**
         * @return inclusive length of the start..end range, or 0 when the
         *         coordinates could not be parsed.
         */
        public int editLength() {
            if (!isParsed()) {
                return 0;
            }
            return ((editEnd - editStart) + 1);
        }

        /**
         * @return length of the edit string, or 0 when there is none.
         */
        public int editStringLength() {
            return (editString != null) ? editString.length() : 0;
        }

        /**
         * @return a short guess at what kind of edit a failing value
         *         describes, used only for the report message.
         */
        public String editType() {
            int editStringLength = editStringLength();
            return (editStringLength == 0) ? "a deletion" : "an insertion";
        }

        @Override
        public String toString() {
            String prefix = "Sequence Region " + seqRegionName + " (" + seqRegionId
                    + ") has an incorrectly formatted _rna_edit attribute '" + attribute + "'. ";
            if (!isParsed()) {
                return prefix + "Could not read integer start and end coordinates from it";
            }
            return prefix + "Edit length was " + editLength()
                    + " but insert string length was " + editStringLength()
                    + ". Mostly likely " + editType();
        }
    }

    // -----------------------------------------------------------------

} // SeqRegionAttribsPresent