/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.dataimport.genes.ontologyBuilder; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import org.erasmusmc.collections.CountingSet; import org.erasmusmc.collections.OneToManyList; import org.erasmusmc.collections.OneToManySet; import org.erasmusmc.ids.DatabaseID; import org.erasmusmc.ontology.ontologyutilities.OntologyUtilities; import org.erasmusmc.utilities.ReadTextFile; import org.erasmusmc.utilities.StringUtilities; public class HomologeneMerger { private String homologeneFilename; private Set<Integer> taxonIDs; private int recipCount = 0; private int overlapCount = 0; public static void main(String[] args){ Set<Integer> allowedTaxonIDs = new HashSet<Integer>(); allowedTaxonIDs.add(9606); // H sapiens allowedTaxonIDs.add(10090); // M musculus allowedTaxonIDs.add(10116); // R norvegicus allowedTaxonIDs.add(83333); //E coli allowedTaxonIDs.add(4932); // S cerevisiae allowedTaxonIDs.add(7227); // D melanogaster allowedTaxonIDs.add(7955); // D rerio allowedTaxonIDs.add(6239); // C elegans allowedTaxonIDs.add(9031); // G gallus new HomologeneMerger("/data/Homologene/homologene.xml", allowedTaxonIDs); } public HomologeneMerger(String homologeneFilename, Set<Integer> taxonIDs){ this.homologeneFilename = homologeneFilename; this.taxonIDs = taxonIDs; } public GeneList merge(GeneList geneList){ Map<String, Gene> eg2gene = createEG2GeneMap(geneList); OneToManySet<String, String> eg2egs = generateMapping(eg2gene); filterMultiPerOrganismLinks(eg2egs, eg2gene); Map<String, Integer> eg2clusterID = convertToClusters(eg2egs); //Go through genes. If gene doesn't belong to cluster, add it to the new genelist: GeneList newGeneList = new GeneList(); OneToManyList<Integer, Gene> cluster2Genes = new OneToManyList<Integer, Gene>(); for (Gene gene : geneList){ boolean inCluster = false; for (DatabaseID databaseID : gene.ids){ if (databaseID.database.equals("EG")){ Integer cluster = eg2clusterID.get(databaseID.ID); if (cluster != null){ cluster2Genes.put(cluster, gene); inCluster = true; break; } } } if (!inCluster) newGeneList.add(gene); } // Merge clusters: int count = 0; int clusterCount = 0; for (Integer cluster : cluster2Genes.keySet()){ Gene homoloGene = new Gene("Homologene"); List<Gene> homologs = cluster2Genes.get(cluster); //Preferred symbol of human has preference, else pick first: for (Gene gene : homologs) if (gene.taxonIDs.contains(9606)) homoloGene.preferredSymbol = gene.preferredSymbol; if (homoloGene.preferredSymbol == null) homoloGene.preferredSymbol = homologs.get(0).preferredSymbol; //Merge homologs: for (Gene gene : homologs) homoloGene.merge(gene); if (homologs.size() > 1){ count += homologs.size(); clusterCount++; } newGeneList.add(homoloGene); } System.out.println("Merged " + count + " genes into "+ clusterCount + " clusters"); return newGeneList; } private Map<String, Gene> createEG2GeneMap(GeneList geneList) { Map<String, Gene> eg2gene = new HashMap<String, Gene>(); for (Gene gene : geneList){ for (DatabaseID dbID : gene.ids) if (dbID.database.equals("EG")) eg2gene.put(dbID.ID, gene); } return eg2gene; } private OneToManySet<String, String> generateMapping(Map<String, Gene> eg2gene){ System.out.println("Loading pairs from homologene"); Map<String, String> gi2egID = new HashMap<String, String>(); String entrezGeneID = null; String gi1 = null; String gi2 = null; OneToManySet<String, String> eg2egs = new OneToManySet<String, String>(); for (String line : new ReadTextFile(homologeneFilename)){ String trimLine = line.trim(); if (trimLine.startsWith("<HG-Gene_geneid>")) entrezGeneID = StringUtilities.findBetween(trimLine, "<HG-Gene_geneid>", "</HG-Gene_geneid>"); else if (trimLine.startsWith("<HG-Gene_taxid>")){ Integer taxonID = Integer.parseInt(StringUtilities.findBetween(trimLine, "<HG-Gene_taxid>", "</HG-Gene_taxid>")); if (!taxonIDs.contains(taxonID)) entrezGeneID = null; } else if (trimLine.startsWith("<HG-Gene_prot-gi>")){ if (entrezGeneID != null){ String gi = StringUtilities.findBetween(trimLine, "<HG-Gene_prot-gi>", "</HG-Gene_prot-gi>"); gi2egID.put(gi, entrezGeneID); } } else if (trimLine.equals("</HG-Gene>")) entrezGeneID = null; else if (trimLine.startsWith("<HG-Stats_gi1>")) gi1 = StringUtilities.findBetween(trimLine, "<HG-Stats_gi1>", "</HG-Stats_gi1>"); else if (trimLine.startsWith("<HG-Stats_gi2>")) gi2 = StringUtilities.findBetween(trimLine, "<HG-Stats_gi2>", "</HG-Stats_gi2>"); else if (trimLine.startsWith("<HG-Stats_recip-best")) { String recip = StringUtilities.findBetween(trimLine, "<HG-Stats_recip-best value=\"", "\"/>"); String eg1 = gi2egID.get(gi1); String eg2 = gi2egID.get(gi2); if (eg1 != null && eg2 != null){ Gene gene1 = eg2gene.get(eg1); Gene gene2 = eg2gene.get(eg2); if (gene1 != null && gene2 != null){ if (areOrthologs(gene1, gene2, recip.equals("true"))) { eg2egs.put(eg1, eg2); eg2egs.put(eg2, eg1); } } } } else if (trimLine.startsWith("</HG-Stats>")) { gi1 = null; gi2 = null; } else if (trimLine.startsWith("</HG-Entry>")) { gi2egID.clear(); } } System.out.println("Created " + recipCount + " pairs due to reciprocity, " + overlapCount + " pairs due to overlap"); return eg2egs; } private void filterMultiPerOrganismLinks(OneToManySet<String, String> eg2egs, Map<String,Gene> eg2gene){ System.out.println("Filtering multiple-per-organism links"); int originalCount = 0; int deletedCount = 0; for (Map.Entry<String, Set<String>> entry : eg2egs.entrySet()){ originalCount += entry.getValue().size(); CountingSet<Integer> taxonCounts = new CountingSet<Integer>(); taxonCounts.add(eg2gene.get(entry.getKey()).taxonIDs.iterator().next()); for (String eg : entry.getValue()) taxonCounts.add(eg2gene.get(eg).taxonIDs.iterator().next()); Iterator<String> iterator = entry.getValue().iterator(); while (iterator.hasNext()){ Integer taxonID = eg2gene.get(iterator.next()).taxonIDs.iterator().next(); if (taxonCounts.getCount(taxonID) != 1){ iterator.remove(); deletedCount++; } } } for (Map.Entry<String, Set<String>> entry : eg2egs.entrySet()){ Iterator<String> iterator = entry.getValue().iterator(); while (iterator.hasNext()){ String eg = iterator.next(); if (!eg2egs.get(eg).contains(entry.getKey())){ iterator.remove(); deletedCount++; } } } System.out.println("Filtering removed " + deletedCount + " of " + originalCount + " pairs"); } private boolean areOrthologs(Gene gene1, Gene gene2, boolean reciprocity) { if (reciprocity){ recipCount++; return true; } if (calculateOverlapScore(gene1, gene2) >= 50){ overlapCount++; return true; } return false; } private int calculateOverlapScore(Gene gene1, Gene gene2) { int score = 0; if (samePreferredSymbol(gene1, gene2)) score += 15; for (String symbol1 : gene1.symbols){ String lcSymbol1 = symbol1.toLowerCase(); for (String symbol2 : gene2.symbols){ String lcSymbol2 = symbol2.toLowerCase(); if (lcSymbol1.equals(lcSymbol2)) if (StringUtilities.containsNumber(symbol1)) score += 20; else score += 10; } } for (String name1 : gene1.names){ String lcName11 = name1.toLowerCase(); for (String name2 : gene2.names){ String lcName2 = name2.toLowerCase(); if (lcName11.equals(lcName2)) if (OntologyUtilities.isGeneSymbol(name1)) score += 15; else score += 25; } } return score; } private boolean samePreferredSymbol(Gene gene1, Gene gene2) { if (gene1.preferredSymbol == null || gene2.preferredSymbol == null) return false; else return (gene1.preferredSymbol.toLowerCase().equals(gene2.preferredSymbol.toLowerCase())); } private Map<String, Integer> convertToClusters(OneToManySet<String, String> eg2egs){ Map<String, Integer> eg2clusterID = new HashMap<String, Integer>(); int nextCluster = 0; for (Map.Entry<String, Set<String>> entry : eg2egs.entrySet()){ Integer cluster = eg2clusterID.get(entry.getKey()); Iterator<String> iterator = entry.getValue().iterator(); while (cluster == null && iterator.hasNext()) cluster = eg2clusterID.get(iterator.next()); if (cluster == null) cluster = nextCluster++; eg2clusterID.put(entry.getKey(), cluster); for (String eg : entry.getValue()) eg2clusterID.put(eg, cluster); } System.out.println(nextCluster + " clusters with an average of " + (eg2clusterID.size() / (double)nextCluster) + " genes"); return eg2clusterID; } }