/* * Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute * Copyright [2016-2017] EMBL-European Bioinformatics Institute * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ensembl.healthcheck.testcase.generic; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import org.ensembl.healthcheck.DatabaseRegistryEntry; import org.ensembl.healthcheck.DatabaseType; import org.ensembl.healthcheck.ReportManager; import org.ensembl.healthcheck.Team; import org.ensembl.healthcheck.testcase.Repair; import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase; import org.ensembl.healthcheck.util.DBUtils; /** * An EnsEMBL Healthcheck test case which checks that the protein_feature table agrees with the translation table. */ public class ProteinFeatureTranslation extends SingleDatabaseTestCase implements Repair { // hash of lists of protein features to delete // key - database name private Map featuresToDelete; private static int THRESHOLD = 1000; // don't report a problem if there are less results than this private static int OUTPUT_LIMIT = 20; // only complain about this many missing translations or long translations /** * Create an ProteinFeatureTranslationTestCase that applies to a specific set of databases. */ public ProteinFeatureTranslation() { featuresToDelete = new HashMap(); setFailureText("Large numbers of features longer than the translation indicate something is wrong. A few is probably OK"); setHintLongRunning(true); setTeamResponsible(Team.GENEBUILD); } /** * This test only applies to core and Vega databases. */ public void types() { removeAppliesToType(DatabaseType.OTHERFEATURES); removeAppliesToType(DatabaseType.ESTGENE); removeAppliesToType(DatabaseType.CDNA); removeAppliesToType(DatabaseType.RNASEQ); } /** * Builds a cache of the translation lengths, then compares them with the values in the protein_features table. * * @param dbre * The database to use. * @return Result. */ public boolean run(DatabaseRegistryEntry dbre) { boolean result = true; int problems = 0; // get list of transcripts String sql = "SELECT t.transcript_id, e.exon_id, tl.start_exon_id, " + " tl.translation_id, tl.end_exon_id, tl.seq_start, " + " tl.seq_end, e.seq_region_start, e.seq_region_end " + "FROM transcript t, exon_transcript et, exon e, translation tl " + "WHERE t.transcript_id = et.transcript_id " + "AND et.exon_id = e.exon_id " + "AND t.transcript_id = tl.transcript_id " + "ORDER BY t.transcript_id, et.rank"; try { Connection con = dbre.getConnection(); // check that the protein feature table actually has some rows - if // not there's // no point working out the translation lengths if (!tableHasRows(con, "protein_feature")) { ReportManager.problem(this, con, "protein_feature table is empty"); return false; // shoud we return true or false in this case? } // NOTE: By default the MM MySQL JDBC driver reads and stores *all* // rows in the // ResultSet. // Since this TestCase is likely to produce lots of output, we must // use the // "streaming" // mode where only one row of the ResultSet is stored at a time. // To do this, the following two lines are both necessary. // See the README file for the mm MySQL driver. Statement stmt = con.createStatement(java.sql.ResultSet.TYPE_FORWARD_ONLY, java.sql.ResultSet.CONCUR_READ_ONLY); stmt.setFetchSize(1000); Map translationLengths = new HashMap(); // now calculate and store the translation lengths ResultSet rs = stmt.executeQuery(sql); rs.setFetchDirection(ResultSet.FETCH_FORWARD); boolean inCodingRegion = false; while (rs.next()) { int currentTranslationID = rs.getInt("translation_id"); Integer id = new Integer(currentTranslationID); // initialise if necessary if (translationLengths.get(id) == null) { translationLengths.put(id, new Integer(0)); } if (!inCodingRegion) { if (rs.getInt("start_exon_id") == rs.getInt("exon_id")) { // single-exon-translations if (rs.getInt("start_exon_id") == rs.getInt("end_exon_id")) { int length = (rs.getInt("seq_end") - rs.getInt("seq_start")) + 1; translationLengths.put(id, new Integer(length)); continue; } inCodingRegion = true; // subtract seq_start int currentLength = ((Integer) translationLengths.get(id)).intValue(); currentLength -= (rs.getInt("seq_start") - 1); translationLengths.put(id, new Integer(currentLength)); } } // if !inCoding if (inCodingRegion) { if (rs.getInt("exon_id") == rs.getInt("end_exon_id")) { // add seq_end int currentLength = ((Integer) translationLengths.get(id)).intValue(); currentLength += rs.getInt("seq_end"); translationLengths.put(id, new Integer(currentLength)); inCodingRegion = false; } else { int currentLength = ((Integer) translationLengths.get(id)).intValue(); currentLength += (rs.getInt("seq_region_end") - rs.getInt("seq_region_start")) + 1; translationLengths.put(id, new Integer(currentLength)); // inCodingRegion = false; } } // if inCoding } // while rs rs.close(); stmt.close(); stmt = null; // Re-open the statement to make sure it's GC'd stmt = con.createStatement(java.sql.ResultSet.TYPE_FORWARD_ONLY, java.sql.ResultSet.CONCUR_READ_ONLY); // stmt.setFetchSize(1000); stmt.setFetchSize(Integer.MIN_VALUE); logger.fine("Built translation length cache, about to look at protein features"); // dumpTranslationLengths(con, translationLengths, 100); // find protein features where seq_end is > than the length of the // translation List thisDBFeatures = new ArrayList(); rs = stmt.executeQuery("SELECT protein_feature_id, translation_id, seq_end FROM protein_feature"); while (rs.next()) { Integer translationID = new Integer(rs.getInt("translation_id")); Integer proteinFeatureID = new Integer(rs.getInt("protein_feature_id")); if (translationLengths.get(translationID) != null) { // some codons can only be 2 bp int minTranslationLength = (((Integer) translationLengths.get(translationID)).intValue() + 2) / 3; // int minTranslationLength = ((Integer) // translationLengths.get(translationID)).intValue(); if (rs.getInt("seq_end") > minTranslationLength) { thisDBFeatures.add(proteinFeatureID); // System.out.println("proteinFeatureID: " + proteinFeatureID); } } else { if (problems++ < OUTPUT_LIMIT) { ReportManager.problem(this, con, "Protein feature " + proteinFeatureID + " refers to non-existent translation " + translationID); } } } featuresToDelete.put(DBUtils.getShortDatabaseName(con), thisDBFeatures); if (thisDBFeatures.size() > THRESHOLD) { ReportManager.problem(this, con, "protein_feature table has " + thisDBFeatures.size() + " features that are longer than the translation"); result = false; } else if (thisDBFeatures.size() == 0) { ReportManager.correct(this, con, "protein_feature table has no features that are longer than the translation"); } else { ReportManager.correct(this, con, "protein_feature table has " + thisDBFeatures.size() + " features that are longer than the translation; this is less than the threshold of " + THRESHOLD); } rs.close(); stmt.close(); if (problems >= OUTPUT_LIMIT) { ReportManager.problem(this, con, "Note that only " + OUTPUT_LIMIT + " missing translation IDs were notified, there may be more"); } } catch (Exception e) { e.printStackTrace(); } return result; } // ------------------------------------------ // Implementation of Repair interface. /** * Delete any protein features that run past the end of the translation. <strong>CAUTION! </strong>Actually deletes the features * from the protein_feature table. * * @param dbre * The database to use. */ public void repair(DatabaseRegistryEntry dbre) { Connection con = dbre.getConnection(); String sql = setupRepairSQL(con); if (sql.length() == 0) { System.out.println("No invalid protein features were found in " + DBUtils.getShortDatabaseName(con)); } else { try { Statement stmt = con.createStatement(); System.out.println(DBUtils.getShortDatabaseName(con)); System.out.println(sql); // stmt.execute(sql); stmt.close(); } catch (SQLException se) { se.printStackTrace(); } } } /** * Show which protein features would be deleted by the repair method. * * @param dbre * The database to use. */ public void show(DatabaseRegistryEntry dbre) { System.out.println("Candidate for repair:"); Connection con = dbre.getConnection(); String sql = setupRepairSQL(con); if (sql.length() == 0) { System.out.println("No invalid protein features were found in " + DBUtils.getShortDatabaseName(con)); } else { System.out.println(DBUtils.getShortDatabaseName(con) + ": " + sql); } } /** * Set up the SQL to delete the offending protein features. * * @param con * The database connection to use. * @return The SQL to delete the incorrect protein features, or "" if there are no problems. */ private String setupRepairSQL(Connection con) { StringBuffer sql = new StringBuffer("DELETE FROM protein_feature WHERE protein_feature_id IN ("); List thisDBFeatures = (List) featuresToDelete.get(DBUtils.getShortDatabaseName(con)); if (thisDBFeatures == null || thisDBFeatures.size() == 0) { return ""; } Iterator featureIterator = thisDBFeatures.iterator(); while (featureIterator.hasNext()) { sql.append(((Integer) featureIterator.next()).intValue()); if (featureIterator.hasNext()) { sql.append(","); } } sql.append(")"); return sql.toString(); } // ------------------------------------------------------------------------- // private void dumpTranslationLengths(Connection con, Map lengths, int maxID) { // // System.out.println("Translation lengths for " + DBUtils.getShortDatabaseName(con)); // // Set keySet = lengths.keySet(); // List keyList = new ArrayList(keySet); // Collections.sort(keyList, new IntegerComparator()); // // Iterator it = keyList.iterator(); // while (it.hasNext()) { // // Integer iid = (Integer) it.next(); // int id = iid.intValue(); // if (id > maxID) { // break; // } // Integer iLength = (Integer) lengths.get(iid); // int length = iLength.intValue(); // System.out.println("ID: " + id + "\tLength: " + length); // } // // } // ------------------------------------------------------------------------- } // ProteinFeatureTranslationTestCase