/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ensembl.healthcheck.testcase.eg_core;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.List;
import java.util.Map;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.DatabaseType;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.util.CollectionUtils;
/**
* An EnsEMBL Healthcheck test case which checks that the protein_feature table
* agrees with the translation table. Modified for use with EnsemblGenomes to
* support seq_edits
*/
public class EgProteinFeatureTranslation extends AbstractEgCoreTestCase {
private static int THRESHOLD = 5; // don't report a problem if there are
// less results than this
private static int DISPLAY_LIMIT = 20;
/**
* Create an ProteinFeatureTranslationTestCase that applies to a specific
* set of databases.
*/
public EgProteinFeatureTranslation() {
super();
setFailureText("Large numbers of features longer than the translation indicate something is wrong. A few is probably OK");
setHintLongRunning(true);
}
/**
* This test only applies to core and Vega databases.
*/
public void types() {
removeAppliesToType(DatabaseType.OTHERFEATURES);
removeAppliesToType(DatabaseType.ESTGENE);
removeAppliesToType(DatabaseType.CDNA);
}
/**
* Builds a cache of the translation lengths, then compares them with the
* values in the protein_features table.
*
* @param dbre
* The database to use.
* @return Result.
*/
public boolean runTest(DatabaseRegistryEntry dbre) {
boolean result = true;
// get list of transcripts
String sql = "SELECT t.transcript_id, e.exon_id, tl.start_exon_id, "
+ " tl.translation_id, tl.end_exon_id, tl.seq_start, "
+ " tl.seq_end, e.seq_region_start, e.seq_region_end "
+ "FROM transcript t, exon_transcript et, exon e, translation tl "
+ "WHERE t.transcript_id = et.transcript_id "
+ "AND et.exon_id = e.exon_id "
+ "AND t.transcript_id = tl.transcript_id "
+ "ORDER BY t.transcript_id, et.rank";
String sqlSeqEdit = "SELECT ta.translation_id,ta.value FROM translation_attrib ta where ta.attrib_type_id=144";
try {
Connection con = dbre.getConnection();
// check that the protein feature table actually has some rows - if
// not there's
// no point working out the translation lengths
if (!tableHasRows(con, "protein_feature")) {
ReportManager.problem(this, con,
"protein_feature table is empty");
return false; // shoud we return true or false in this case?
}
// NOTE: By default the MM MySQL JDBC driver reads and stores *all*
// rows in the
// ResultSet.
// Since this TestCase is likely to produce lots of output, we must
// use the
// "streaming"
// mode where only one row of the ResultSet is stored at a time.
// To do this, the following two lines are both necessary.
// See the README file for the mm MySQL driver.
Statement stmt = con.createStatement(
java.sql.ResultSet.TYPE_FORWARD_ONLY,
java.sql.ResultSet.CONCUR_READ_ONLY);
stmt.setFetchSize(1000);
Map<Integer, Integer> translationLengths = CollectionUtils
.createHashMap();
// now calculate and store the translation lengths
ResultSet rs = stmt.executeQuery(sql);
rs.setFetchDirection(ResultSet.FETCH_FORWARD);
boolean inCodingRegion = false;
while (rs.next()) {
int currentTranslationID = rs.getInt("translation_id");
Integer id = new Integer(currentTranslationID);
// initialise if necessary
if (translationLengths.get(id) == null) {
translationLengths.put(id, new Integer(0));
}
if (!inCodingRegion) {
if (rs.getInt("start_exon_id") == rs.getInt("exon_id")) {
// single-exon-translations
if (rs.getInt("start_exon_id") == rs
.getInt("end_exon_id")) {
int length = (rs.getInt("seq_end") - rs
.getInt("seq_start")) + 1;
translationLengths.put(id, new Integer(length));
continue;
}
inCodingRegion = true;
// subtract seq_start
int currentLength = ((Integer) translationLengths
.get(id)).intValue();
currentLength -= (rs.getInt("seq_start") - 1);
translationLengths.put(id, new Integer(currentLength));
}
} // if !inCoding
if (inCodingRegion) {
if (rs.getInt("exon_id") == rs.getInt("end_exon_id")) {
// add seq_end
int currentLength = ((Integer) translationLengths
.get(id)).intValue();
currentLength += rs.getInt("seq_end");
translationLengths.put(id, new Integer(currentLength));
inCodingRegion = false;
} else {
int currentLength = ((Integer) translationLengths
.get(id)).intValue();
currentLength += (rs.getInt("seq_region_end") - rs
.getInt("seq_region_start")) + 1;
translationLengths.put(id, new Integer(currentLength));
// inCodingRegion = false;
}
} // if inCoding
} // while rs
rs.close();
stmt.close();
// modify according to seqedits
stmt = con.createStatement(java.sql.ResultSet.TYPE_FORWARD_ONLY,
java.sql.ResultSet.CONCUR_READ_ONLY);
stmt.setFetchSize(1000);
rs = stmt.executeQuery(sqlSeqEdit);
while (rs.next()) {
Integer translationId = rs.getInt(1);
String edit = rs.getString(2);
String[] vals = edit.split(" +");
if (!vals[0].equals(vals[1])) {
Integer len = (Integer) translationLengths
.get(translationId);
Integer insLen = new Integer(len + (3 * vals[2].length()));
translationLengths.put(translationId, insLen);
}
}
rs.close();
stmt.close();
stmt = null;
// Re-open the statement to make sure it's GC'd
stmt = con.createStatement(java.sql.ResultSet.TYPE_FORWARD_ONLY,
java.sql.ResultSet.CONCUR_READ_ONLY);
stmt.setFetchSize(1000);
logger.fine("Built translation length cache, about to look at protein features");
// dumpTranslationLengths(con, translationLengths, 100);
// find protein features where seq_end is > than the length of the
// translation
List<String> thisDBFeatures = CollectionUtils.createArrayList();
rs = stmt
.executeQuery("SELECT pf.protein_feature_id, pf.translation_id, pf.seq_end, a.logic_name, pf.hit_name "
+ "FROM protein_feature pf join analysis a using (analysis_id)");
while (rs.next()) {
Integer translationID = new Integer(rs.getInt("translation_id"));
Integer proteinFeatureID = new Integer(
rs.getInt("protein_feature_id"));
if (translationLengths.get(translationID) != null) {
// some codons can only be 2 bp ?!?
int maxTranslationLength = (((Integer) translationLengths
.get(translationID)).intValue() + 3) / 3;
int fl = rs.getInt("seq_end");
if (fl > maxTranslationLength) {
result = false;
String msg = "Protein feature " + proteinFeatureID
+ "(" + rs.getString(4) + "/" + rs.getString(5)
+ ") ends at " + fl + " which is beyond the "
+ maxTranslationLength
+ " length of the translation " + translationID;
thisDBFeatures.add(msg);
}
}
}
if (thisDBFeatures.size() > THRESHOLD) {
ReportManager
.problem(
this,
con,
"protein_feature table has "
+ thisDBFeatures.size()
+ " features that are longer than the translation");
int n = 0;
for (String msg : thisDBFeatures) {
if (n < DISPLAY_LIMIT) {
ReportManager.problem(this, con, msg);
} else if (n == DISPLAY_LIMIT
&& DISPLAY_LIMIT < thisDBFeatures.size()) {
ReportManager.problem(this, con, "... "
+ (thisDBFeatures.size() - DISPLAY_LIMIT)
+ " more problem translations remain");
}
n++;
}
} else if (thisDBFeatures.size() == 0) {
ReportManager
.correct(this, con,
"protein_feature_table has no features that are longer than the translation");
} else {
ReportManager
.correct(
this,
con,
"protein_feature_table has "
+ thisDBFeatures.size()
+ " features that are longer than the translation; this is less than the threshold of "
+ THRESHOLD);
}
rs.close();
stmt.close();
} catch (Exception e) {
e.printStackTrace();
}
return result;
}
/* (non-Javadoc)
* @see org.ensembl.healthcheck.testcase.AbstractTemplatedTestCase#getEgDescription()
*/
@Override
protected String getEgDescription() {
return "Checks that the protein_feature table agrees with the translation table.";
}
} // ProteinFeatureTranslationTestCase