/*
* Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
* Copyright [2016-2017] EMBL-European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* File: DuplicateProteinFeatures.java
* Created by: dwilson
* Created on: Mar 4, 2013
* CVS: $$
*/
package org.ensembl.healthcheck.testcase.eg_core;
import org.ensembl.healthcheck.DatabaseRegistryEntry;
import org.ensembl.healthcheck.ReportManager;
/**
* Test to find where protein_features have been added twice
* @author dwilson
*
*/
public class DuplicateProteinFeature extends AbstractEgCoreTestCase {
private final static String DUPLICATE_XREF = "select count(*) from (select count(*) from xref x group by x.dbprimary_acc,x.external_db_id,x.info_type,x.info_text having count(*)>1) cc";
private final static String DUPLICATE_OBJ_XREF = "select count(*) from (select count(*) from xref x join object_xref ox using (xref_id) group by ox.ensembl_id, ox.ensembl_object_type,x.dbprimary_acc,x.external_db_id,x.info_type,x.info_text having count(*)>1) cc";
private final static String DUPLICATE_PF = "SELECT COUNT(*) FROM (SELECT COUNT(*) FROM protein_feature GROUP BY translation_id, seq_start, seq_end, hit_start, hit_end, hit_name, analysis_id, score, evalue, perc_ident HAVING COUNT(*)>1) cc;";
private final static String ALTERNATE_SQL = "SELECT COUNT(*) FROM protein_feature pf1, protein_feature pf2 "
+ "WHERE pf1.protein_feature_id != pf2.protein_feature_id AND "
+ "pf1.translation_id = pf2.translation_id AND "
+ "pf1.seq_start = pf2.seq_start AND "
+ "pf1.seq_end = pf2.seq_end AND "
+ "pf1.hit_start = pf2.hit_start AND "
+ "pf1.hit_end = pf2.hit_end AND "
+ "pf1.hit_name = pf2.hit_name AND "
+ "pf1.analysis_id = pf2.analysis_id AND "
+ "pf1.score = pf2.score AND "
+ "pf1.evalue = pf2.evalue AND "
+ "pf1.perc_ident = pf2.perc_ident;";
protected boolean runTest(DatabaseRegistryEntry dbre) {
boolean passes = true;
int nDupPF = getTemplate(dbre).queryForDefaultObject(DUPLICATE_PF, Integer.class);
if(nDupPF>0) {
passes = false;
ReportManager.problem(this, dbre.getConnection(), nDupPF+" duplicates found in protein_feature: "+DUPLICATE_PF);
ReportManager.problem(this, dbre.getConnection(), "Alternative useful SQL: "+ALTERNATE_SQL);
}
return passes;
}
/* (non-Javadoc)
* @see org.ensembl.healthcheck.testcase.AbstractTemplatedTestCase#getEgDescription()
*/
@Override
protected String getEgDescription() {
return "Test to find where protein_features have been added twice";
}
}