/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 * Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */
package org.erasmusmc.dataimport.genes.ontologyBuilder;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.erasmusmc.collections.OneToManyList;
import org.erasmusmc.collections.OneToManySet;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.DefaultTypes;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.Relation;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.ontology.ontologyutilities.HomonymAnalyzer;
import org.erasmusmc.ontology.ontologyutilities.OntologyUtilities;
import org.erasmusmc.peregrine.AbstractPeregrine;
import org.erasmusmc.utilities.StringUtilities;

/**
 * Merges concepts of an ontology that share enough homonymous terms.
 *
 * <p>The pipeline run by {@link #merge(Ontology)} is: find homonym pairs, keep the
 * pairs whose overlap score reaches {@link #minOverlapScore}, group the kept pairs
 * into clusters, and finally merge every cluster into a freshly created concept
 * whose ID is allocated from {@link #startConceptNumber} upwards.
 */
public class NameMerger {

  /** Minimum overlap score a homonym pair needs in order to be merged. */
  public static int minOverlapScore = 50;

  /** Location of the binary normaliser cache loaded by the homonym analyzer. */
  public static String normCacheFileName = "/home/public/Peregrine/standardNormCache2006.bin";

  /** ID given to the first merged concept; subsequent clusters get consecutive IDs. */
  public static int startConceptNumber = 4000000;

  /**
   * Object ID of the {@code fromVocabulary} relation that {@link #findHuman(List)} looks for.
   * NOTE(review): presumably the ID of the human vocabulary -- confirm against the ontology.
   */
  private static final int HUMAN_VOCABULARY_ID = -1001;

  /** Ontology being processed; assigned at the start of {@link #merge(Ontology)}. */
  private Ontology ontology;

  /**
   * Finds homonymous concepts in the given ontology and merges those with sufficient overlap.
   * The ontology is modified in place.
   *
   * @param ontology the ontology to analyse and modify
   */
  public void merge(Ontology ontology) {
    this.ontology = ontology;
    Map<Integer, Map<Integer, List<String>>> homonyms = findHomonyms();
    OneToManySet<Integer, Integer> mapping = findSufficientOverlap(homonyms);
    Map<Integer, Integer> concept2clusterID = createClusters(mapping);
    mergeClusters(concept2clusterID);
  }

  /**
   * Creates one new concept per cluster and merges all cluster members into it.
   *
   * @param concept2clusterID cluster ID for every concept that takes part in a merge
   */
  private void mergeClusters(Map<Integer, Integer> concept2clusterID) {
    // Invert the concept -> cluster map so every cluster can be handled as a unit.
    OneToManyList<Integer, Integer> cluster2conceptID = new OneToManyList<Integer, Integer>();
    for (Map.Entry<Integer, Integer> entry : concept2clusterID.entrySet()) {
      cluster2conceptID.put(entry.getValue(), entry.getKey());
    }

    int newCID = startConceptNumber;
    for (Integer clusterID : cluster2conceptID.keySet()) {
      List<Integer> cluster = cluster2conceptID.get(clusterID);
      Integer humanCID = findHuman(cluster);
      Concept concept = new Concept(newCID);
      ontology.setConcept(concept);
      // The human concept (if any) is merged before all others.
      // NOTE(review): presumably so its data takes precedence in the merged concept -- confirm.
      if (humanCID != null) {
        OntologyUtilities.mergeConcepts(ontology, humanCID, newCID);
      }
      for (Integer conceptID : cluster) {
        // Compare boxed IDs by value; '!=' would compare object references.
        if (!conceptID.equals(humanCID)) {
          OntologyUtilities.mergeConcepts(ontology, conceptID, newCID);
        }
      }
      newCID++;
    }
  }

  /**
   * Returns the first concept of the cluster that has a {@code fromVocabulary} relation
   * whose object equals {@link #HUMAN_VOCABULARY_ID}.
   *
   * @param cluster concept IDs belonging to one cluster
   * @return the matching concept ID, or {@code null} when the cluster has none
   */
  private Integer findHuman(List<Integer> cluster) {
    for (Integer conceptID : cluster) {
      List<Relation> relations =
          ontology.getRelationsForConceptAsSubject(conceptID, DefaultTypes.fromVocabulary);
      for (Relation relation : relations) {
        if (relation.object == HUMAN_VOCABULARY_ID) {
          return conceptID;
        }
      }
    }
    return null;
  }

  /**
   * Groups concepts into clusters such that any two concepts connected through a chain of
   * mapped pairs end up in the same cluster (connected components of the mapping).
   *
   * <p>A union-find forest is used so that a pair linking two already existing clusters
   * joins them completely instead of relabelling only the concepts named in that pair.
   *
   * @param mapping concept pairs that were judged similar enough to merge
   * @return cluster ID (numbered densely from 0) for every concept occurring in the mapping
   */
  private Map<Integer, Integer> createClusters(OneToManySet<Integer, Integer> mapping) {
    Map<Integer, Integer> parent = new HashMap<Integer, Integer>();
    for (Map.Entry<Integer, Set<Integer>> entry : mapping.entrySet()) {
      Integer root = findRoot(parent, entry.getKey());
      for (Integer cid : entry.getValue()) {
        Integer otherRoot = findRoot(parent, cid);
        if (!otherRoot.equals(root)) {
          parent.put(otherRoot, root);
        }
      }
    }

    Map<Integer, Integer> root2cluster = new HashMap<Integer, Integer>();
    Map<Integer, Integer> concept2clusterID = new HashMap<Integer, Integer>();
    int nextCluster = 0;
    // Iterate over a snapshot of the keys: findRoot rewrites entries of 'parent'.
    for (Integer conceptID : parent.keySet().toArray(new Integer[0])) {
      Integer root = findRoot(parent, conceptID);
      Integer cluster = root2cluster.get(root);
      if (cluster == null) {
        cluster = nextCluster++;
        root2cluster.put(root, cluster);
      }
      concept2clusterID.put(conceptID, cluster);
    }

    System.out.println(nextCluster + " clusters with an average of " + (concept2clusterID.size() / (double)nextCluster) + " genes");
    return concept2clusterID;
  }

  /**
   * Returns the representative of the set containing the given concept, registering the
   * concept as its own singleton set when it has not been seen before. Paths walked on the
   * way are shortened to point directly at the representative.
   *
   * @param parent union-find forest mapping every known concept to its parent
   * @param conceptID concept to look up
   * @return the representative concept ID of the concept's set
   */
  private static Integer findRoot(Map<Integer, Integer> parent, Integer conceptID) {
    if (!parent.containsKey(conceptID)) {
      parent.put(conceptID, conceptID);
      return conceptID;
    }
    Integer root = conceptID;
    Integer next = parent.get(root);
    while (!next.equals(root)) {
      root = next;
      next = parent.get(root);
    }
    // Path compression: re-point every node on the walked path at the root.
    Integer current = conceptID;
    while (!current.equals(root)) {
      Integer up = parent.get(current);
      parent.put(current, root);
      current = up;
    }
    return root;
  }

  /**
   * Selects the homonym pairs whose overlap score is at least {@link #minOverlapScore}.
   *
   * @param homonyms for each concept, the concepts it shares terms with and those shared terms
   * @return the concept pairs that should be merged
   */
  private OneToManySet<Integer, Integer> findSufficientOverlap(
      Map<Integer, Map<Integer, List<String>>> homonyms) {
    OneToManySet<Integer, Integer> mapping = new OneToManySet<Integer, Integer>();
    int startCount = 0;
    int endCount = 0;
    for (Map.Entry<Integer, Map<Integer, List<String>>> entry : homonyms.entrySet()) {
      Integer conceptID1 = entry.getKey();
      for (Map.Entry<Integer, List<String>> entry2 : entry.getValue().entrySet()) {
        startCount++;
        Integer conceptID2 = entry2.getKey();
        int overlapScore = computeOverlapScore(conceptID1, conceptID2, entry2.getValue());
        if (overlapScore >= minOverlapScore) {
          endCount++;
          mapping.put(conceptID1, conceptID2);
        }
      }
    }
    System.out.println("Of the " + startCount + " homonym concept pairs, " + endCount + " will be merged");
    return mapping;
  }

  /**
   * Scores how strongly two concepts overlap: 15 points for an identical preferred term,
   * plus per shared term 20 points for a gene symbol containing a number, 10 for a gene
   * symbol without a number, and 25 for any other term (long form).
   *
   * @param conceptID1 first concept of the pair
   * @param conceptID2 second concept of the pair
   * @param overlapTerms terms shared by both concepts
   * @return the accumulated overlap score
   */
  private int computeOverlapScore(Integer conceptID1, Integer conceptID2, List<String> overlapTerms) {
    int score = 0;
    if (identicalPreferredTerm(conceptID1, conceptID2)) {
      score += 15;
    }
    for (String term : overlapTerms) {
      if (OntologyUtilities.isGeneSymbol(term)) {
        if (StringUtilities.containsNumber(term)) {
          score += 20; // Symbol with number
        } else {
          score += 10; // Symbol without number
        }
      } else {
        score += 25; // Long form
      }
    }
    return score;
  }

  /**
   * Tells whether the names of two concepts are equal, ignoring case.
   * Lower-casing uses {@link Locale#ROOT} so the result does not depend on the default locale.
   *
   * @param conceptID1 first concept
   * @param conceptID2 second concept
   * @return {@code true} when both concept names match case-insensitively
   */
  private boolean identicalPreferredTerm(Integer conceptID1, Integer conceptID2) {
    String term1 = ontology.getConcept(conceptID1).getName();
    String term2 = ontology.getConcept(conceptID2).getName();
    return term1.toLowerCase(Locale.ROOT).equals(term2.toLowerCase(Locale.ROOT));
  }

  /**
   * Sets the gene/chemical matching flags on every term of the ontology and then runs a
   * {@link HomonymAnalyzer} configured with the normaliser cache and an extended stop word list.
   *
   * @return the result of {@code HomonymAnalyzer.compareConcepts()}: for each concept, the
   *         concepts it is compared with and the associated term lists
   */
  private Map<Integer, Map<Integer, List<String>>> findHomonyms() {
    for (Concept concept : ontology) {
      for (TermStore term : concept.getTerms()) {
        OntologyUtilities.setGeneChemMatchingFlags(term);
      }
    }

    HomonymAnalyzer homonymAnalyzer = new HomonymAnalyzer();
    homonymAnalyzer.normaliser.loadCacheBinary(normCacheFileName);
    homonymAnalyzer.stopwords = AbstractPeregrine.getDefaultStopWordsForIndexing();
    // Additional stop words on top of the default indexing list.
    homonymAnalyzer.stopwords.add("human");
    homonymAnalyzer.stopwords.add("protein");
    homonymAnalyzer.stopwords.add("gene");
    homonymAnalyzer.stopwords.add("antigen");
    homonymAnalyzer.stopwords.add("product");
    homonymAnalyzer.setOntology(ontology);
    return homonymAnalyzer.compareConcepts();
  }
}