/* * Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute * Copyright [2016-2017] EMBL-European Bioinformatics Institute * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ensembl.healthcheck.testcase.compara; import static java.lang.String.format; import static org.apache.commons.lang.StringUtils.join; import static org.ensembl.healthcheck.ReportManager.problem; import static org.ensembl.healthcheck.util.CollectionUtils.createArrayList; import java.util.List; import java.util.Map; import org.ensembl.healthcheck.DatabaseRegistryEntry; import org.ensembl.healthcheck.Team; import org.ensembl.healthcheck.testcase.AbstractTemplatedTestCase; import org.ensembl.healthcheck.util.DefaultMapRowMapper; import org.ensembl.healthcheck.util.MapRowMapper; import org.ensembl.healthcheck.util.SqlTemplate; /** * Test case used to look for ProteinTrees where we have leaves whose parent * node is the root, we have more than 1 and there is an internal tree structure * therefore this looks to be a very suspect tree. We also try to look for flat * trees where all members have the root as their parent and have more than * getMaxAllowedFlatMembers() in this count. * * @author ayates * @author $Author$ * @version $Revision$ */ public class CheckFlatProteinTrees extends AbstractTemplatedTestCase { public CheckFlatProteinTrees() { setDescription("Look for trees which have internal nodes but all members' parent is the root"); setTeamResponsible(Team.COMPARA); } /** * Returns 3 which is the maximum number of elements we allow in a tree before * considering it to be <em>dodgy</em> */ protected int getMaxAllowedFlatMembers() { return 2; } @Override protected boolean runTest(DatabaseRegistryEntry dbre) { boolean passed = true; SqlTemplate template = getTemplate(dbre); MapRowMapper<Long, Long> mapRowMapper = new DefaultMapRowMapper<Long, Long>( Long.class, Long.class); // Counts all nodes which do not represent a seq_member, do not share the same // id // as their root and whose root id is not 0. Groups this by the root_id String internalNodeCountSql = "SELECT gtn.root_id, count(*) AS internal_nodes FROM gene_tree_node gtn WHERE gtn.seq_member_id IS NULL AND gtn.node_id <> gtn.root_id AND gtn.root_id <> 0 GROUP BY gtn.root_id"; // Counts all members per tree where root id is not 0, whose parent id is // the same as the root id but have more than one of these per tree String flatMemberCountSql = "SELECT gtn.root_id, count(*) AS root_members FROM gene_tree_node gtn WHERE gtn.seq_member_id IS NOT NULL AND gtn.parent_id = gtn.root_id AND gtn.root_id <> 2 GROUP BY gtn.root_id having root_members > 1"; // Count all members where root is not 0 String memberCountSql = "SELECT root_id, count(distinct root_id) from gene_tree_node where root_id <> 0 group by root_id"; // Selects all the non-rooted trees String nonrootedTreesSql = "SELECT root_id, 1 FROM gene_tree_root WHERE tree_type = 'tree' AND clusterset_id LIKE '%\\_it\\_%' AND clusterset_id NOT LIKE 'pg\\_it\\_%' "; Long totalTreesCount = template.queryForDefaultObject( "select count(*) from gene_tree_root", Long.class); Map<Long, Long> internalNodeCounts = template.queryForMap( internalNodeCountSql, mapRowMapper); Map<Long, Long> flatMemberCounts = template.queryForMap(flatMemberCountSql, mapRowMapper); Map<Long, Long> memberCounts = template.queryForMap(memberCountSql, mapRowMapper); Map<Long, Long> nonrootedTrees = template.queryForMap(nonrootedTreesSql, mapRowMapper); List<Long> flatMembersWithInternalStructure = createArrayList(); List<Long> flatTreesStructure = createArrayList(); for (Map.Entry<Long, Long> entry : flatMemberCounts.entrySet()) { // If we have an entry then we have a suspect tree Long nodeId = entry.getKey(); Long internalNodeCount = internalNodeCounts.get(nodeId); if (internalNodeCount != null) { if (! nonrootedTrees.containsKey(nodeId)) { problem(this, dbre.getConnection(), format("%d has a problem: %d", nodeId, nonrootedTrees.get(nodeId))); flatMembersWithInternalStructure.add(nodeId); } } else if (entry.getValue() > getMaxAllowedFlatMembers() && entry.getValue().equals(memberCounts.get(nodeId))) { flatTreesStructure.add(nodeId); } } if (!flatMembersWithInternalStructure.isEmpty()) { reportProblem(dbre, flatMembersWithInternalStructure, totalTreesCount, "have more than one seq_member joined to the root and a well formed internal tree structure"); passed = false; } if (!flatTreesStructure.isEmpty()) { reportProblem(dbre, flatTreesStructure, totalTreesCount, "are flat with more than 2 members"); passed = false; } return passed; } private void reportProblem(DatabaseRegistryEntry dbre, List<Long> ids, Long totalTrees, String customMessage) { String badIdsString = join(ids, ','); String msg = format( "%d trees out of %d %s. Suspect root ids are: [%s]", ids.size(), totalTrees, customMessage, badIdsString); problem(this, dbre.getConnection(), msg); } }