/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ensembl.healthcheck.testcase.generic;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.DatabaseType;
import org.ensembl.healthcheck.ReportManager;
import org.ensembl.healthcheck.Team;
import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase;
import org.ensembl.healthcheck.util.DBUtils;
import org.ensembl.healthcheck.util.SqlTemplate;
/**
* Check that top-level seq regions have some variation density features, and that the values agree between the
* density_feature and seq_region_attrib tables. Only seq regions in the rank-1 coordinate system that carry a karyotype_rank
* attribute are checked (up to a fixed maximum), and the whole test is skipped when no variation database is found or the
* species has no karyotype. Also checks that there is at least one density type for the SnpDensity analysis.
*/
public class VariationDensity extends SingleDatabaseTestCase {
// max number of top-level seq regions to check
private static final int MAX_TOP_LEVEL = 100;
// map between analysis.logic_name and seq_region attrib_type.code
@SuppressWarnings("rawtypes")
private Map logicNameToAttribCode = new HashMap();
/**
* Create a new DensityFeatures testcase.
*/
@SuppressWarnings("unchecked")
public VariationDensity() {
setDescription("Check that all top-level seq regions have some variation density features, and that the values agree between the density_feature and seq_region attrib tables.");
setFailureText("If the genome has been assembled using short-read sequences, some seq_regions might not have density_features");
logicNameToAttribCode.put("SnpDensity", "SnpCount");
setTeamResponsible(Team.RELEASE_COORDINATOR);
}
// ----------------------------------------------------------------------
public void types() {
removeAppliesToType(DatabaseType.OTHERFEATURES);
removeAppliesToType(DatabaseType.ESTGENE);
removeAppliesToType(DatabaseType.EST);
removeAppliesToType(DatabaseType.CDNA);
removeAppliesToType(DatabaseType.VEGA);
removeAppliesToType(DatabaseType.SANGER_VEGA);
removeAppliesToType(DatabaseType.RNASEQ);
}
/**Integer.valueOf(
* Run the test.
*
* @param dbre
* The database to use.
* @return true if the test passed.
*
*/
@SuppressWarnings("unchecked")
public boolean run(DatabaseRegistryEntry dbre) {
boolean result = true;
Connection con = dbre.getConnection();
SqlTemplate t = DBUtils.getSqlTemplate(dbre);
boolean variationDatabaseExists = checkDatabaseExistsByType(dbre,DatabaseType.VARIATION);
if (!variationDatabaseExists) {
return result;
}
// Density features needed only for species with a karyotype
String sqlKaryotype = "SELECT count(*) FROM seq_region_attrib sa, attrib_type at WHERE at.attrib_type_id = sa.attrib_type_id AND code = 'karyotype_rank'";
int karyotype = t.queryForDefaultObject(sqlKaryotype, Integer.class);
if (karyotype == 0) {
return result;
}
result &= checkFeaturesAndCounts(con);
result &= checkAnalysisAndDensityTypes(dbre);
return result;
} // run
// ----------------------------------------------------------------------
@SuppressWarnings("rawtypes")
private boolean checkFeaturesAndCounts(Connection con) {
boolean result = true;
// get top level co-ordinate system ID
String sql = "SELECT coord_system_id FROM coord_system WHERE rank=1 LIMIT 1";
String s = DBUtils.getRowColumnValue(con, sql);
String logicName = "SnpDensity";
String attribCode = "SnpCount";
if (s.length() == 0) {
logger.warning("Error: can't get top-level co-ordinate system for " + DBUtils.getShortDatabaseName(con));
return false;
}
int topLevelCSID = Integer.parseInt(s);
try {
// check each top-level seq_region (up to a limit) to see how many density
// features there are
Statement stmt = con.createStatement();
ResultSet rs = stmt.executeQuery("SELECT s.seq_region_id, s.name, CASE WHEN ae.seq_region_id IS NULL THEN 0 ELSE 1 END as exception FROM seq_region_attrib sa, attrib_type at, seq_region s LEFT JOIN assembly_exception ae ON s.seq_region_id = ae.seq_region_id WHERE s.seq_region_id = sa.seq_region_id AND sa.attrib_type_id = at.attrib_type_id AND at.code = 'karyotype_rank' AND coord_system_id=" + topLevelCSID + " AND (exc_type IN ('HAP', 'PAR') or exc_type IS NULL) GROUP BY s.seq_region_id, s.name, exception");
int numTopLevel = 0;
int noDensity = 0;
while (rs.next() && numTopLevel++ < MAX_TOP_LEVEL) {
long seqRegionID = rs.getLong("s.seq_region_id");
String seqRegionName = rs.getString("s.name");
boolean assemblyException = rs.getBoolean("exception");
logger.fine("Counting density features on seq_region " + seqRegionName);
sql = "SELECT COUNT(*) FROM density_feature WHERE seq_region_id=" + seqRegionID;
int dfRows = DBUtils.getRowCount(con, sql);
if (dfRows == 0) {
noDensity++;
}
// check if this species has appropriate density features
int analRows = DBUtils.getRowCount(con, "SELECT COUNT(*) FROM analysis WHERE logic_name='" + logicName + "'");
if (analRows == 0) {
logger.fine(DBUtils.getShortDatabaseName(con) + " has no " + logicName + " analysis type, skipping checks for these features");
} else {
// check that the sum of the density_feature.density_value matches
// what
// is in the seq_region_attrib table
logger.fine("Comparing density_feature.density_value with seq_region_attrib for " + logicName + " features on " + seqRegionName);
sql = "SELECT SUM(df.density_value) FROM density_type dt, density_feature df, analysis a WHERE dt.density_type_id=df.density_type_id AND dt.analysis_id=a.analysis_id AND a.logic_name='"
+ logicName + "' AND seq_region_id=" + seqRegionID;
String sumDF = DBUtils.getRowColumnValue(con, sql);
// System.out.println(sql + " " + sumDF);
//don't check the sum for haplotypes or PAR regions
if (sumDF != null && sumDF.length() > 0 && !assemblyException) {
long sumFromDensityFeature = Long.parseLong(sumDF);
sql = "SELECT value FROM seq_region_attrib sra, attrib_type at WHERE sra.attrib_type_id=at.attrib_type_id AND at.code='" + attribCode + "' AND seq_region_id=" + seqRegionID;
String sumSRA = DBUtils.getRowColumnValue(con, sql);
// System.out.println(sql + " " + sumSRA);
if (sumSRA != null && sumSRA.length() > 0) {
long valueFromSeqRegionAttrib = Long.parseLong(sumSRA);
if (Math.abs(sumFromDensityFeature - valueFromSeqRegionAttrib) > 1000) { // allow a bit of leeway
ReportManager.problem(this, con, "Sum of values for " + logicName + " from density_feature (" + sumFromDensityFeature + ") doesn't agree with value from seq_region_attrib ("
+ valueFromSeqRegionAttrib + ") for " + seqRegionName);
result = false;
}
} // if sumSRA
if (sumSRA.length() == 0) {
ReportManager.problem(this, con, seqRegionName + " has no seq_region_attrib for " + attribCode);
result = false;
}
} // if sumDF
} // if rows
} // while rs.next
if (noDensity > 0) {
ReportManager.problem(this, con, noDensity + " of the " + MAX_TOP_LEVEL + " first toplevel regions have no density features");
result = false;
}
rs.close();
stmt.close();
if (numTopLevel == MAX_TOP_LEVEL) {
logger.warning("Only checked first " + numTopLevel + " seq_regions");
}
} catch (SQLException se) {
se.printStackTrace();
}
return result;
}
// ----------------------------------------------------------------------
/**
* Check that each analysis_id is used at least in one density_type.
*/
private boolean checkAnalysisAndDensityTypes(DatabaseRegistryEntry dbre) {
boolean result = true;
Connection con = dbre.getConnection();
String logicName = "SnpDensity";
String sql = "SELECT dt.density_type_id FROM analysis a, density_type dt WHERE a.analysis_id=dt.analysis_id AND a.logic_name='" + logicName + "'";
String[] rows = DBUtils.getColumnValues(con, sql);
if (rows.length == 0) {
result = false;
}
return result;
}
// ----------------------------------------------------------------------
} // DensityFeatures