/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.dataimport.genes.ontologyBuilder; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; import org.erasmusmc.collections.CountingSet; import org.erasmusmc.collections.OneToManyList; import org.erasmusmc.ids.DatabaseID; import org.erasmusmc.utilities.StringUtilities; public class DatabaseMerger { private OneToManyList<DatabaseID, Gene> ids2geneInfos = new OneToManyList<DatabaseID, Gene>(); private GeneList geneList = new GeneList(); private CountingSet<String> overlapCounts = new CountingSet<String>(); public void merge(GeneList newGeneInfos){ overlapCounts.clear(); for (Gene gene : newGeneInfos){ Set<Gene> matches = findMatches(gene); if (matches.size() == 0) add(gene); else { int merged = 0; for (Gene match : matches) if (validMatch(match, gene)){ merge(match, gene); merged++; } if (merged == 0) add(gene); else if (merged > 1) System.out.println(merged + " matches found for " + gene.preferredSymbol); } } System.out.println("Overlap statistics: "); overlapCounts.printCounts(); } private boolean validMatch(Gene match, Gene gene) { List<String> overlap = new ArrayList<String>(); Set<String> agreement = new HashSet<String>(); Set<String> disagreement = new HashSet<String>(); for (DatabaseID databaseID1 : gene.ids) for (DatabaseID databaseID2 : match.ids) if (databaseID1.database.equals(databaseID2.database)) if (databaseID1.ID.equals(databaseID2.ID)){ overlap.add(databaseID1.database); agreement.add(databaseID1.database); } else disagreement.add(databaseID1.database); int unresolvedDisagreements = 0; for (String database : disagreement){ if (!agreement.contains(database)) unresolvedDisagreements++; } //int resolvedDisagreements = disagreement.size() - unresolvedDisagreements; if (agreement.size() > unresolvedDisagreements){ Collections.sort(overlap); overlapCounts.add(StringUtilities.join(overlap, " ")); return true; } else return false; } private void merge(Gene match, Gene geneInfo) { for (DatabaseID databaseID : geneInfo.ids) if (match.ids.add(databaseID)) ids2geneInfos.put(databaseID, match); match.names.addAll(geneInfo.names); match.symbols.addAll(geneInfo.symbols); } private void add(Gene gene) { geneList.add(gene); for (DatabaseID databaseID : gene.ids) ids2geneInfos.put(databaseID, gene); } private Set<Gene> findMatches(Gene geneInfo) { Set<Gene> result = new HashSet<Gene>(); for (DatabaseID databaseID : geneInfo.ids) result.addAll(ids2geneInfos.get(databaseID)); return result; } public GeneList getMergedGeneList(){ return geneList; } }