/* * Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute * Copyright [2016-2017] EMBL-European Bioinformatics Institute * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ensembl.healthcheck.testcase.variation; import java.sql.Connection; import java.sql.ResultSet; import java.sql.Statement; import java.util.ArrayList; import org.ensembl.healthcheck.DatabaseRegistryEntry; import org.ensembl.healthcheck.ReportManager; import org.ensembl.healthcheck.Team; import org.ensembl.healthcheck.testcase.SingleDatabaseTestCase; import org.ensembl.healthcheck.util.DBUtils; /** * Check that allele frequencies add up to 1 */ public class AlleleFrequencies extends SingleDatabaseTestCase { /** * Creates a new instance of Check Allele Frequencies */ public AlleleFrequencies() { //addToGroup("variation-long"); setHintLongRunning(true); setDescription("Check that the allele frequencies add up to 1"); setTeamResponsible(Team.VARIATION); } /** * Check that all allele/genotype frequencies add up to 1 for the same variation/subsnp and sample. * * @param dbre * The database to use. * @return Result. */ public boolean run(DatabaseRegistryEntry dbre) { boolean result = true; Connection con = dbre.getConnection(); String[] tables = new String[] { "population_genotype", "allele" }; // Set this flag to true if we want to count ALL failed frequencies and not just break as soon as we've found one boolean countAll = false; // Tolerance for the deviation from 1.0 float tol = 0.025f; // Get the results in batches (determined by the variation_id) int chunk = 250000; // long mainStart = System.currentTimeMillis(); try { Statement stmt = con.createStatement(); // Get variations with allele/genotype frequencies that don't add up to 1 for the same variation_id, subsnp_id and population_id for (int i = 0; i < tables.length; i++) { // long subStart = System.currentTimeMillis(); // Get the maximum variation id String sql = "SELECT MAX(s.variation_id) FROM " + tables[i] + " s"; sql = DBUtils.getRowColumnValue(con, sql); if (sql.length() == 0) { sql = "0"; } int maxId = Integer.parseInt(sql); // The query to get the data sql = "SELECT s.variation_id, s.subsnp_id, s.population_id, s.frequency FROM " + tables[i] + " s USE INDEX (variation_idx,subsnp_idx) WHERE s.variation_id BETWEEN VIDLOWER AND VIDUPPER ORDER BY s.variation_id, s.subsnp_id, s.population_id"; int offset = 1; // Count the number of failed int failed = 0; // Keep the failed entries ArrayList failedEntries = new ArrayList(); // Loop until we've reached the maximum variation_id while (offset <= maxId) { // long s = System.currentTimeMillis(); // Replace the offsets in the SQL query ResultSet rs = stmt.executeQuery(sql.replaceFirst("VIDLOWER", String.valueOf(offset)).replaceFirst("VIDUPPER", String.valueOf(offset + chunk))); // long e = System.currentTimeMillis(); // System.out.println("Got " + String.valueOf(chunk) + " variations in " + String.valueOf(((e-s)/1000)) + // " seconds. Offset is " + String.valueOf(offset)); // Increase the offset with the chunk size and add 1 offset += chunk + 1; int lastVid = 0; int lastSSid = 0; int lastSid = 0; int curVid; int curSSid; int curSid; float freq; float sum = 1.f; int count = 0; while (rs.next()) { // Get the variation_id, subsnp_id, population_id and frequency. If any of these are NULL, they will be returned as 0 curVid = rs.getInt(1); curSSid = rs.getInt(2); curSid = rs.getInt(3); freq = rs.getFloat(4); // If any of the values was NULL, skip processing the row. For the frequency, we have to use the wasNull() function to // check this. The ids it is sufficient to check if they were 0 if (curVid != 0 && curSSid != 0 && curSid != 0 && !rs.wasNull()) { // If any of the ids is different from the last one, stop summing and check the sum of the latest variation if (curVid != lastVid || curSSid != lastSSid || curSid != lastSid) { // See if the sum of the frequencies deviates from 1 more than what we tolerate. In that case, count it as a failed if (Math.abs(1.f - sum) > tol) { // Store the failed data in failedEntries failedEntries.add(new int[] { lastVid, lastSSid, lastSid, Math.round(1000 * sum) }); failed++; } // Set the last ids to this one and reset the sum lastVid = curVid; lastSSid = curSSid; lastSid = curSid; sum = 0.f; } // Add the frequency to the sum sum += freq; } count++; // Break if we've encountered a failed frequency (unless flagged not to) if (failed > 0 && !countAll) { break; } } rs.close(); // s = System.currentTimeMillis(); // System.out.println("Processed " + String.valueOf(count) + " rows in " + String.valueOf(((s-e)/1000)) + " seconds"); } if (failed == 0) { // Report that the current table is ok ReportManager.correct(this, con, "Frequencies in " + tables[i] + " all add up to 1"); } else { // Get an example and print it int[] entry = (int[]) failedEntries.get(0); String example = "variation_id = " + String.valueOf(entry[0]) + ", subsnp_id = " + String.valueOf(entry[1]) + ", population_id = " + String.valueOf(entry[2]) + ", sum is " + String.valueOf((0.001f * entry[3])); ReportManager.problem(this, con, "There are " + String.valueOf(failed) + " variations in " + tables[i] + " where the frequencies don't add up to 1 +/- " + String.valueOf(tol) + " (e.g. " + example + ")"); result = false; // Loop over the failed entries and print a list of variation_id, subsnp_id, population_id and summed frequency to stdout /* for (int j = 0; j < failedEntries.size(); j++) { entry = (int[]) failedEntries.get(j); System.out.println(String.valueOf(entry[0]) + "\t" + String.valueOf(entry[1]) + "\t" + String.valueOf(entry[2]) + "\t" + String.valueOf((0.001f * entry[3]))); } */ } // long subEnd = System.currentTimeMillis(); // System.out.println("Time for healthcheck on " + tables[i] + " (~" + String.valueOf(maxId) + " variations) was " + // String.valueOf(((subEnd-subStart)/1000)) + " seconds"); } stmt.close(); } catch (Exception e) { result = false; e.printStackTrace(); } // long mainEnd = System.currentTimeMillis(); // System.out.println("Total time for healthcheck was " + String.valueOf(((mainEnd-mainStart)/1000)) + " seconds"); if (result) { ReportManager.correct(this, con, "Allele/Genotype frequency healthcheck passed without any problem"); } return result; } // run } // AlleleFrequencies