/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ensembl.healthcheck.testcase.generic;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.DatabaseType;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase;
import org.ensembl.healthcheck.util.Utils;
/**
* Check for multiple components which overlap and are assembled to the same thing. Note that multiple assembly is OK, overlapping
* components is not.
*/
public class AssemblyMultipleOverlap extends SingleDatabaseTestCase {
private static final int MAX = 10; // maximum number of overlaps to print
/**
* Creates a new instance of AssemblyMultipleOverlap.
*/
public AssemblyMultipleOverlap() {
setDescription("Check for multiple components which overlap and are assembled to the same thing.");
setTeamResponsible(Team.GENEBUILD);
}
/**
* Data is only tested in core database, as the tables are in sync
*/
public void types() {
removeAppliesToType(DatabaseType.OTHERFEATURES);
removeAppliesToType(DatabaseType.ESTGENE);
removeAppliesToType(DatabaseType.RNASEQ);
removeAppliesToType(DatabaseType.CDNA);
}
/**
* Run the test.
*
* @param dbre
* The database to use.
* @return true if the test passed.
*
*/
public boolean run(DatabaseRegistryEntry dbre) {
boolean result = true;
Connection con = dbre.getConnection();
// get list of all multiply-assembled components
String sql = "SELECT sr1.name AS cmp_sr_name, cs1.name AS cmp_cs, sr2.name AS asm_sr_name, cs2.name AS asm_cs, a.asm_seq_region_id, a.cmp_seq_region_id, COUNT(*) AS count "
+ "FROM assembly a, seq_region sr1, seq_region sr2, coord_system cs1, coord_system cs2 " + "WHERE a.cmp_seq_region_id = sr1.seq_region_id AND a.asm_seq_region_id = sr2.seq_region_id "
+ "AND sr1.coord_system_id = cs1.coord_system_id AND sr2.coord_system_id = cs2.coord_system_id " + "GROUP BY asm_seq_region_id, cmp_seq_region_id, asm_start, cmp_start, ori HAVING count > 1;";
int overlapCount = 0;
try {
Statement stmt = con.createStatement();
ResultSet rs = stmt.executeQuery(sql);
PreparedStatement cmpStmt = con.prepareStatement("SELECT asm_start, asm_end, ori FROM assembly WHERE asm_seq_region_id=? AND cmp_seq_region_id=? ORDER BY asm_start");
while (rs.next()) {
long asm_seq_region_id = rs.getLong("asm_seq_region_id");
long cmp_seq_region_id = rs.getLong("cmp_seq_region_id");
// get start, end for each component
cmpStmt.setLong(1, asm_seq_region_id);
cmpStmt.setLong(2, cmp_seq_region_id);
ResultSet cmpRS = cmpStmt.executeQuery();
// read all start/end/strand
List startsL = new ArrayList();
List endsL = new ArrayList();
List strandsL = new ArrayList();
while (cmpRS.next()) {
startsL.add(cmpRS.getLong("asm_start"));
endsL.add(cmpRS.getLong("asm_end"));
strandsL.add(cmpRS.getLong("ori"));
}
cmpRS.close();
// convert to arrays - easier to keep track
long starts[] = Utils.listToArrayLong(startsL);
long ends[] = Utils.listToArrayLong(endsL);
long strands[] = Utils.listToArrayLong(strandsL);
// check pairs for overlaps
// note ORDER BY asm_start means we have less comparisons to do
for (int i = 0; i < starts.length; i++) {
for (int j = i + 1; j < starts.length; j++) {
if (strands[i] == strands[j]) {
if (starts[j] < ends[i]) {
overlapCount++;
if (overlapCount < MAX) {
// System.out.println("Overlap: cmp " + cmp_seq_region_id + " asm " + asm_seq_region_id + " " + starts[i] + " " +
// ends[i] + " " + starts[j] + " " + ends[j]);
}
}
}
}
}
}
rs.close();
} catch (Exception e) {
result = false;
e.printStackTrace();
}
if (overlapCount > 0) {
ReportManager.problem(this, con, overlapCount + " instances of multiple overlapping assembled components");
result = false;
} else {
ReportManager.correct(this, con, "No multiply-assembled overlapping components");
}
return result;
} // run
// -------------------------------------------------------------------------
} // AssemblyMultipleOverlap