/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Copyright (C) 2003 EBI, GRL
*
* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.ensembl.healthcheck.testcase.generic;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.DatabaseType;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase;
import org.ensembl.healthcheck.util.DBUtils;
/**
 * Compare the transcript stable IDs and exon coordinates between two
 * releases. This does not compare counts, so it does not extend
 * ComparePreviousVersionBase.
 *
 * <p>Both complete sets of protein coding exon keys are held in memory at the
 * same time, so the JVM needs a generous heap (e.g. -Xmx1700m).
 */
public class ComparePreviousVersionExonCoords extends SingleDatabaseTestCase {

    /** Meta key whose value is compared between the two databases before exons are checked. */
    private static final String LAST_GENESET_UPDATE_KEY = "genebuild.last_geneset_update";

    /**
     * Query producing one "transcript stable id:exon start:exon end" string for
     * every exon of every transcript whose biotype is protein_coding.
     */
    private static final String EXON_COORDS_SQL = "SELECT CONCAT(t.stable_id, ':', e.seq_region_start, ':', e.seq_region_end) FROM transcript t, exon_transcript et, exon e WHERE t.transcript_id=et.transcript_id AND et.exon_id=e.exon_id AND t.biotype='protein_coding'";

    /**
     * Create a new testcase.
     */
    public ComparePreviousVersionExonCoords() {
        setDescription("Compare the transcript stable IDs and exon coordinates for each exon across releases to ensure that protein sequences are the same.");
        setEffect("Causes problems for Compara if proteins are not identical");
        setTeamResponsible(Team.CORE);
        setSecondTeamResponsible(Team.GENEBUILD);
    }

    /**
     * This test does not apply to sangervega, vega, rnaseq, cdna or
     * otherfeatures databases.
     */
    public void types() {
        removeAppliesToType(DatabaseType.SANGER_VEGA);
        removeAppliesToType(DatabaseType.VEGA);
        removeAppliesToType(DatabaseType.RNASEQ);
        removeAppliesToType(DatabaseType.CDNA);
        removeAppliesToType(DatabaseType.OTHERFEATURES);
    }

    // ----------------------------------------------------------------------

    /**
     * Run the comparison between a database and its equivalent on the
     * secondary server.
     *
     * <p>The comparison is skipped (and the test passes) when the
     * ignore.previous.checks system property is set, when no previous database
     * can be found, when both genebuild versions are positive and differ, or
     * when the genebuild.last_geneset_update meta value differs.
     *
     * @param current
     *          The database to check.
     * @return true if the test passed or was skipped, false if the exon sets
     *         differ or could not be read.
     */
    public boolean run(DatabaseRegistryEntry current) {

        if (System.getProperty("ignore.previous.checks") != null) {
            logger.finest("ignore.previous.checks is set in database.properties, skipping this test");
            return true;
        }

        Connection currentCon = current.getConnection();

        // skip databases where there's no previous one (e.g. new species)
        DatabaseRegistryEntry previous = getEquivalentFromSecondaryServer(current);
        if (previous == null) {
            ReportManager.correct(this, currentCon, "Can't identify previous database - new species?");
            return true;
        }

        Connection previousCon = previous.getConnection();

        // and those where the genebuild version has changed - expect exon coords to change then.
        // If we can't get the genebuild version (due to a non-standard database name, for example), check anyway.
        int currentVersion = current.getNumericGeneBuildVersion();
        int previousVersion = previous.getNumericGeneBuildVersion();
        if (currentVersion > 0 && previousVersion > 0 && currentVersion != previousVersion) {
            ReportManager.correct(this, currentCon, "Genebuild version has changed since " + previous.getName() + ", skipping");
            return true;
        }

        // and those where the meta key genebuild.last_geneset_update has changed.
        // NOTE(review): what getMetaValue returns for a missing key is not visible from here,
        // so the comparison is null-safe rather than calling equals() on the result directly.
        String currentUpdate = DBUtils.getMetaValue(currentCon, LAST_GENESET_UPDATE_KEY);
        String previousUpdate = DBUtils.getMetaValue(previousCon, LAST_GENESET_UPDATE_KEY);
        boolean sameUpdate = (currentUpdate == null) ? (previousUpdate == null) : currentUpdate.equals(previousUpdate);
        if (!sameUpdate) {
            ReportManager.correct(this, currentCon, "Meta entry genebuild.last_geneset_update has changed since " + previous.getName() + ", skipping");
            return true;
        }

        // build hashes of transcript stable id:exon start:exon end for both databases
        Map<String, String> currentHash;
        Map<String, String> previousHash;
        try {
            logger.finest("Building hash of current exon coords");
            currentHash = buildHash(currentCon);
            logger.finest("Building hash of previous exon coords");
            previousHash = buildHash(previousCon);
        } catch (SQLException e) {
            // A failed query must fail the test: comparing partial or empty sets would
            // either report spurious differences or wrongly declare the databases identical.
            System.err.println("Error executing " + EXON_COORDS_SQL);
            e.printStackTrace();
            ReportManager.problem(this, currentCon, "Unable to read exon coordinates for comparison with " + previous.getName() + ": " + e.getMessage());
            return false;
        }

        logger.finest("Comparing ...");

        // Remove every current key from the previous set. Values are always non-null,
        // so a null result from remove() means the key was absent from the previous database.
        int inNewNotOld = 0;
        for (String currentKey : currentHash.keySet()) {
            if (previousHash.remove(currentKey) == null) {
                inNewNotOld++;
            }
        }

        // previousHash now only contains keys that were in the old database but not in the new one
        int inOldNotNew = previousHash.size();

        if (inNewNotOld == 0 && inOldNotNew == 0) {
            ReportManager.correct(this, currentCon, "All protein coding exons identical between databases");
            return true;
        }

        if (inOldNotNew == 0) {
            ReportManager.problem(this, currentCon, inNewNotOld + " protein coding exons in " + current.getName() + " are not in " + previous.getName());
        } else if (inNewNotOld == 0) {
            ReportManager.problem(this, currentCon, inOldNotNew + " protein coding exons in " + previous.getName() + " are not in " + current.getName());
        } else {
            ReportManager.problem(this, currentCon, inOldNotNew + " protein coding exons in " + previous.getName()
                    + " have coordinates that are different from those in the same transcript in " + current.getName());
        }

        return false;
    }

    // ----------------------------------------------------------------------

    /**
     * Read the "transcript stable id:exon start:exon end" key of every protein
     * coding exon from a database.
     *
     * @param con
     *          Connection to the database to read.
     * @return A map whose keys are the exon keys; every value is the placeholder "1".
     * @throws SQLException
     *           If the query fails, so that the caller never compares an incomplete set.
     */
    private Map<String, String> buildHash(Connection con) throws SQLException {

        Map<String, String> hash = new HashMap<String, String>();

        Statement stmt = null;
        ResultSet rs = null;
        try {
            stmt = con.createStatement();
            rs = stmt.executeQuery(EXON_COORDS_SQL);
            while (rs.next()) {
                hash.put(rs.getString(1), "1");
            }
        } finally {
            // release the JDBC resources even when the query or iteration fails
            try {
                if (rs != null) {
                    rs.close();
                }
            } finally {
                if (stmt != null) {
                    stmt.close();
                }
            }
        }

        return hash;
    }

    // ----------------------------------------------------------------------

} // ComparePreviousVersionExonCoords