/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ensembl.healthcheck.testcase.generic;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.DatabaseType;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase;
import org.ensembl.healthcheck.util.DBUtils;
/**
* An EnsEMBL Healthcheck test case which checks all exon of a gene are on the
* same strand and in the correct order in their transcript.
*/
public class ExonStrandOrder extends SingleDatabaseTestCase {
private static final int TRANSCRIPT_WARN_LENGTH = 2000000;
/**
* Constructor.
*/
public ExonStrandOrder() {
addToGroup("post_genebuild");
setHintLongRunning(true);
setTeamResponsible(Team.GENEBUILD);
}
/**
* This only applies to core and Vega databases.
*/
public void types() {
removeAppliesToType(DatabaseType.OTHERFEATURES);
removeAppliesToType(DatabaseType.RNASEQ);
}
/**
* Check strand order of exons.
*
* @param dbre
* The database to use.
* @return Result.
*/
public boolean run(DatabaseRegistryEntry dbre) {
boolean result = true;
int singleExonTranscripts = 0, transcriptCount = 0;
String sql = "SELECT g.gene_id, g.seq_region_start, g.seq_region_end, g.seq_region_strand, tr.transcript_id, tr.seq_region_start, tr.seq_region_end, tr.seq_region_strand,e.exon_id, e.seq_region_start, e.seq_region_end, e.seq_region_strand, et.rank, g.stable_id, tr.stable_id "
+ "FROM gene g, transcript tr, exon_transcript et, exon e " + "WHERE e.exon_id = et.exon_id "
+ "AND et.transcript_id = tr.transcript_id " + "AND tr.gene_id = g.gene_id "
+ "AND tr.transcript_id NOT IN "
+ " (SELECT transcript_id FROM transcript_attrib INNER JOIN attrib_type USING (attrib_type_id) WHERE code='trans_spliced')"
+ "ORDER BY et.transcript_id, et.rank";
System.out.println(sql);
Connection con = dbre.getConnection();
Statement st = null;
ResultSet rs = null;
try {
st = con.createStatement(java.sql.ResultSet.TYPE_FORWARD_ONLY, java.sql.ResultSet.CONCUR_READ_ONLY);
st.setFetchSize(Integer.MIN_VALUE);
rs = st.executeQuery(sql);
long lastTranscriptID = -1;
long lastExonStart = -1;
long lastExonEnd = -1;
long lastExonStrand = -2;
long lastExonID = -1;
int lastExonRank = 0;
// ResultSet is ordered by transcript ID and rank, so we can loop
// through
// and look at the grouped exons for each transcript
while (rs.next()) {
long geneID = rs.getLong(1);
// long geneStart = rs.getLong(2);
// long geneEnd = rs.getLong(3);
// int geneStrand = rs.getInt(4);
long transcriptID = rs.getLong(5);
long transcriptStart = rs.getLong(6);
long transcriptEnd = rs.getLong(7);
int transcriptStrand = rs.getInt(8);
long exonID = rs.getLong(9);
long exonStart = rs.getLong(10);
long exonEnd = rs.getLong(11);
int exonStrand = rs.getInt(12);
int exonRank = rs.getInt(13);
String geneStableID = rs.getString(14);
String transcriptStableID = rs.getString(15);
if (transcriptID == lastTranscriptID) {
if (lastExonStrand < -1) {
// first exon in "new" transcript
lastExonStrand = exonStrand;
lastExonStart = exonStart;
lastExonEnd = exonEnd;
lastExonID = exonID;
lastExonRank = exonRank;
} else {
// check all exons in a transcript have the same strand
if (exonStrand != lastExonStrand) {
ReportManager.problem(this, con,
"Exons in transcript " + transcriptID + " have different strands");
result = false;
}
// check all exons have the same strand as their
// transcript
if (exonStrand != transcriptStrand) {
ReportManager.problem(this, con, "Exon " + exonID + " in transcript " + transcriptID
+ " has strand " + exonStrand + " but transcript's strand is " + transcriptStrand);
result = false;
}
// check that exon start/ends make sense
if (exonStrand == 1) {
if (lastExonEnd > exonStart) {
ReportManager.problem(this, con,
"Exons " + lastExonID + " (end " + lastExonEnd + ") and " + exonID + " (start "
+ exonStart + ") in transcript " + transcriptID
+ " appear to overlap (positive strand)");
result = false;
}
} else if (exonStrand == -1) {
if (lastExonStart < exonEnd) {
ReportManager.problem(this, con,
"Exons " + lastExonID + " (start " + lastExonStart + ") and " + exonID
+ " (end " + exonEnd + ") in transcript " + transcriptID
+ " appear to overlap (negative strand)");
result = false;
}
}
// check for rank jumping
if (exonRank - lastExonRank > 1) {
ReportManager.problem(this, con, "Exon rank jump in exon " + exonID + " transcript: "
+ transcriptID + " gene: " + geneID);
result = false;
}
// get ready for next exon
lastExonStrand = exonStrand;
lastExonStrand = exonStrand;
lastExonStart = exonStart;
lastExonEnd = exonEnd;
lastExonID = exonID;
lastExonRank = exonRank;
} // if first exon
} else {
// check for single-exon transcripts (highest rank = 1)
if (lastExonRank == 1) {
singleExonTranscripts++;
}
// next
lastTranscriptID = transcriptID;
lastExonStrand = -2;
lastExonStart = -1;
lastExonEnd = -1;
lastExonID = -1;
lastExonRank = 0;
transcriptCount++;
}
} // while rs
if ((double) singleExonTranscripts / transcriptCount > 0.2) {
ReportManager.warning(this, con,
"High single exon transcript count. (" + singleExonTranscripts + "/" + transcriptCount + ")");
}
} catch (Exception e) {
e.printStackTrace();
} finally {
DBUtils.closeQuietly(rs);
DBUtils.closeQuietly(st);
}
ReportManager.correct(this, con, "Exon strand order seems OK");
return result;
}
} // ExonStrandOrder