/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Copyright (C) 2012 EBI, GRL
*
* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.ensembl.healthcheck.testcase.variation;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Properties;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.DatabaseType;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.Species;
import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase;
/**
 * Counts, per sequence region, the compressed_genotype_region entries belonging to the samples of one
 * 1000 Genomes sub-population ({@link #POP_NAME}) and reports the regions whose count is below
 * {@link #MIN_GENOTYPE}. The check is only performed for Homo sapiens databases.
 */
public class CompressedGenotypeRegion extends SingleDatabaseTestCase {

    // The name of one of the 1000 Genomes sub-populations
    private static final String POP_NAME = "1000GENOMES:phase_1_CEU";

    // The minimum number of genotype entries for a 1000 Genomes sub-population (e.g. CEU) in the
    // compressed_genotype_region table. The number corresponds to a value slightly lower than the number of
    // entries for chromosome 22.
    private static final int MIN_GENOTYPE = 70000;

    /**
     * Creates a new instance of CompressedGenotypeRegion TestCase
     */
    public CompressedGenotypeRegion() {

        addToGroup("variation-release");
        setDescription("Checks that the compressed_genotype_region table is valid for the 1000 Genomes data");
        setTeamResponsible(Team.VARIATION);

    }

    // ---------------------------------------------------------------------

    /**
     * Store the SQL queries in a Properties object.
     *
     * @return the queries, keyed by "popId" and "genotype_region".
     */
    private Properties getSQLQueries() {

        // Store all the needed SQL statements in a Properties object
        Properties sqlQueries = new Properties();
        String query;

        // Query getting the id of a population
        query = "SELECT population_id FROM population vs WHERE name = ? LIMIT 1";
        sqlQueries.setProperty("popId", query);

        // Query counting the number of genotypes of a population, by chromosome
        query = "SELECT s.name, COUNT(*) FROM compressed_genotype_region c, seq_region s WHERE s.seq_region_id=c.seq_region_id AND c.sample_id IN (SELECT sample_id FROM sample_population WHERE population_id=?) GROUP BY s.seq_region_id";
        sqlQueries.setProperty("genotype_region", query);

        return sqlQueries;
    }

    // ---------------------------------------------------------------------

    /**
     * Check that every sequence region (apart from the excluded short ones) has at least
     * {@link #MIN_GENOTYPE} compressed genotype entries for the population {@link #POP_NAME}.
     * Databases of species other than Homo sapiens are not checked and pass.
     *
     * @param dbre
     *          The database to check.
     * @return true if the test passed; false if regions with a low count were found or a query failed.
     */
    public boolean run(DatabaseRegistryEntry dbre) {

        // The 1000 Genomes data only exist for human
        if (dbre.getSpecies() != Species.HOMO_SAPIENS) {
            return true;
        }

        boolean result = true;
        Connection con = dbre.getConnection();
        Properties sqlQueries = getSQLQueries();

        // Collected per call, so that repeated runs on the same instance do not accumulate region names
        StringBuilder lowRegions = new StringBuilder();
        int count = 0;

        try {
            int popId = getPopulationId(con, sqlQueries.getProperty("popId"));
            count = countGenotypeByRegion(con, sqlQueries.getProperty("genotype_region"), popId, lowRegions);
        } catch (Exception e) {
            // A check that could not be completed must not be reported as passed
            result = false;
            ReportManager.problem(this, con, "HealthCheck caused an exception: " + e.getMessage());
        }

        if (count > 0) {
            result = false;
            ReportManager.problem(this, con, "There are " + String.valueOf(count)
                    + " region(s) (" + lowRegions + ") with a low number of genotypes for the 1000 Genomes population " + POP_NAME + " in the table compressed_genotype_region");
        }

        return result;
    }

    // -----------------------------------------------------------------

    /**
     * Look up the population_id of {@link #POP_NAME}.
     *
     * @param con
     *          The connection to the database to query (not closed by this method).
     * @param sql
     *          The query, with a single placeholder for the population name.
     * @return the population_id, or 0 if the query returned no row.
     * @throws Exception
     *           if the query cannot be prepared or executed.
     */
    private int getPopulationId(Connection con, String sql) throws Exception {

        try (PreparedStatement pStmt = con.prepareStatement(sql)) {
            pStmt.setString(1, POP_NAME);
            try (ResultSet rs = pStmt.executeQuery()) {
                return rs.next() ? rs.getInt(1) : 0;
            }
        }

    } // getPopulationId

    // -----------------------------------------------------------------

    /**
     * Count the regions whose number of genotype entries is below {@link #MIN_GENOTYPE}.
     *
     * @param con
     *          The connection to the database to query (not closed by this method).
     * @param sql
     *          The query returning (region name, count) rows, with a single placeholder for the population id.
     * @param pop_id
     *          The population_id bound to the query placeholder.
     * @param lowRegions
     *          Receives the comma-separated names of the regions counted.
     * @return the number of regions with a low number of genotype entries.
     * @throws Exception
     *           if the query cannot be prepared or executed.
     */
    private int countGenotypeByRegion(Connection con, String sql, int pop_id, StringBuilder lowRegions) throws Exception {

        int count = 0;

        try (PreparedStatement pStmt = con.prepareStatement(sql)) {
            pStmt.setInt(1, pop_id);
            try (ResultSet rs = pStmt.executeQuery()) {
                while (rs.next()) {
                    String regionName = rs.getString(1);
                    int genotypeCount = rs.getInt(2);

                    // exclude short sequences from this check
                    boolean excluded = regionName.equals("MT") || regionName.endsWith("PATCH") || regionName.startsWith("HSCHR");

                    if (genotypeCount < MIN_GENOTYPE && !excluded) {
                        count++;
                        if (lowRegions.length() > 0) {
                            lowRegions.append(", ");
                        }
                        lowRegions.append(regionName);
                    }
                }
            }
        }

        return count;

    } // countGenotypeByRegion
}