/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.ontology.ontologyutilities;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.Map.Entry;
import org.erasmusmc.collections.SortedIntListSet;
import org.erasmusmc.math.vector.SparseVectorInt2Float;
import org.erasmusmc.math.vector.VectorCursor;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.utilities.SetUtilities;
import org.erasmusmc.utilities.TextFileUtilities;
/**
 * Finds concepts in an {@link Ontology} whose term lists overlap and merges them into
 * newly created concepts.
 *
 * <p>Two strategies are offered: a greedy one that merges every transitively connected
 * group of mergeable concepts, and a conservative one that only merges groups in which
 * every pair of concepts is mergeable (a clique).
 *
 * <p>All configuration lives in public static fields, so the settings are shared by every
 * caller in the JVM.
 */
public class ConceptMerger {

    /**
     * ID given to the first newly created (merged) concept. The conservative algorithm
     * increments this field for every concept it creates; the greedy algorithm only reads it.
     */
    public static int startConceptNumber = 4000000;

    // ---- merge rules -------------------------------------------------------------------

    /** Merge two concepts when their Dice coefficient is exactly 1. */
    public static boolean mergeIdentical = true;

    /** Merge two concepts when the overlap equals the number of terms of either concept. */
    public static boolean mergeWhenSubSet = true;

    /** Minimum Dice coefficient required when the preferred terms of both concepts match. */
    public static double minDiceForMergeWhenPreferredTermsMatch = 0.35;

    /** Minimum Dice coefficient required when the preferred terms do not match. */
    public static double minDiceForMerge = 0.4d;

    /** Score substituted for a pairwise score of 0 while pruning, so the product never collapses to 0. */
    public static double baseScore = 0.01;

    /** The Dice-based rules only apply when at least this many terms overlap. */
    public static int minimumOverlapForDiceMerge = 2;

    /**
     * Runs {@link #conceptMerge(Ontology, boolean)} and writes one line per created concept
     * ({@code newId: [mergedIds]}) to {@code logfile}.
     *
     * @param ontology     ontology to modify in place
     * @param logfile      file the mapping lines are saved to
     * @param conservative {@code true} for the conservative algorithm, {@code false} for the greedy one
     * @return map from each new concept ID to the IDs that were merged into it
     */
    public static Map<Integer, Set<Integer>> conceptMerge(Ontology ontology, String logfile, boolean conservative) {
        Map<Integer, Set<Integer>> mappings = conceptMerge(ontology, conservative);
        List<String> lines = new ArrayList<String>();
        for (Entry<Integer, Set<Integer>> mapping : mappings.entrySet()) {
            lines.add(mapping.getKey() + ": " + mapping.getValue().toString());
        }
        TextFileUtilities.saveToFile(lines, logfile);
        return mappings;
    }

    /**
     * Merges concepts with the selected algorithm.
     *
     * @param ontology                  ontology to modify in place
     * @param conservativeMergeAlgorithm {@code true} selects {@link #conservativeConceptMerge(Ontology)},
     *                                  {@code false} selects {@link #greedyConceptMerge(Ontology)}
     * @return map from each new concept ID to the IDs that were merged into it
     */
    public static Map<Integer, Set<Integer>> conceptMerge(Ontology ontology, boolean conservativeMergeAlgorithm) {
        if (conservativeMergeAlgorithm) {
            return conservativeConceptMerge(ontology);
        }
        return greedyConceptMerge(ontology);
    }

    /**
     * Asks a {@link HomonymAnalyzer} for the pairwise overlap between the concepts of the ontology.
     * The values are used by this class as the number of overlapping terms of a concept pair
     * (presumably keyed by the lower concept ID -- TODO confirm against HomonymAnalyzer).
     */
    private static Map<Integer, SparseVectorInt2Float> getOverlapMap(Ontology ontology) {
        HomonymAnalyzer homonymAnalyzer = new HomonymAnalyzer();
        homonymAnalyzer.setOntology(ontology);
        return homonymAnalyzer.compareConceptsLight();
    }

    /**
     * Records a symmetric relation between two concept IDs, creating the neighbour sets on demand.
     *
     * @param id1     first concept ID
     * @param id2     second concept ID
     * @param mapping adjacency map that is updated in place
     */
    public static void addRelation(int id1, int id2, Map<Integer, Set<Integer>> mapping) {
        Set<Integer> neighboursOfFirst = getOrCreateNeighbours(id1, mapping);
        Set<Integer> neighboursOfSecond = getOrCreateNeighbours(id2, mapping);
        neighboursOfFirst.add(id2);
        neighboursOfSecond.add(id1);
    }

    /** Returns the neighbour set stored for {@code id}, registering an empty one if none exists yet. */
    private static Set<Integer> getOrCreateNeighbours(int id, Map<Integer, Set<Integer>> mapping) {
        Set<Integer> neighbours = mapping.get(id);
        if (neighbours == null) {
            neighbours = new SortedIntListSet();
            mapping.put(id, neighbours);
        }
        return neighbours;
    }

    /**
     * Builds the symmetric adjacency map of all concept pairs that {@link #shouldBeMerged(Concept, Concept, int)}
     * accepts, based on the non-zero entries of the overlap map.
     */
    private static Map<Integer, Set<Integer>> getMappings(Ontology ontology, Map<Integer, SparseVectorInt2Float> overlapMapping) {
        Map<Integer, Set<Integer>> mappings = new TreeMap<Integer, Set<Integer>>();
        for (Entry<Integer, SparseVectorInt2Float> entry : overlapMapping.entrySet()) {
            Concept concept1 = ontology.getConcept(entry.getKey());
            VectorCursor<Integer> cursor = entry.getValue().getNonzeroCursor();
            while (cursor.isValid()) {
                int hits = (int) Math.round(cursor.get());
                Concept concept2 = ontology.getConcept(cursor.dimension());
                if (shouldBeMerged(concept1, concept2, hits)) {
                    addRelation(concept1.getID(), concept2.getID(), mappings);
                }
                cursor.next();
            }
        }
        return mappings;
    }

    /**
     * Repeatedly merges cliques of mutually mergeable concepts until a round produces no mapping.
     *
     * <p>Every round recomputes the overlap on the (already modified) ontology, derives the
     * pairwise mergeable relation, reduces the connected group of each concept to a clique by
     * pruning, and merges each clique into a new concept.
     *
     * <p>NOTE(review): a concept whose neighbours were all consumed by an earlier clique in the
     * same round yields a one-element clique, which is still counted and re-created under a new
     * ID -- confirm this is intended.
     *
     * @param ontology ontology to modify in place
     * @return map from each new concept ID to the IDs that were merged into it
     */
    public static Map<Integer, Set<Integer>> conservativeConceptMerge(Ontology ontology) {
        Map<Integer, Set<Integer>> out = new TreeMap<Integer, Set<Integer>>();
        int numberOfMappings;
        do {
            Set<Set<Integer>> uniqueMappings = new HashSet<Set<Integer>>();
            Map<Integer, SparseVectorInt2Float> overlap = getOverlapMap(ontology);
            Map<Integer, Set<Integer>> pairwiseMapping = getMappings(ontology, overlap);
            // Snapshot of the keys: pairwiseMapping shrinks while the cliques are extracted.
            List<Integer> ids = new ArrayList<Integer>(pairwiseMapping.keySet());
            numberOfMappings = 0;
            for (Integer id : ids) {
                Set<Integer> mapClique = pruneCliqueForConcept(id, pairwiseMapping, overlap, ontology);
                removeCliqueFromMapping(mapClique, pairwiseMapping);
                if (!mapClique.isEmpty()) {
                    uniqueMappings.add(mapClique);
                    numberOfMappings += mapClique.size();
                }
            }
            out.putAll(performMappings(uniqueMappings, ontology));
            System.out.println("generated " + numberOfMappings + " mappings");
        } while (numberOfMappings > 0);
        return out;
    }

    /**
     * Creates one new concept per mapping and merges all listed concepts into it.
     * Advances {@link #startConceptNumber} by one for every concept created.
     */
    private static Map<Integer, Set<Integer>> performMappings(Set<Set<Integer>> uniqueMappings, Ontology ontology) {
        Map<Integer, Set<Integer>> out = new TreeMap<Integer, Set<Integer>>();
        for (Set<Integer> mapping : uniqueMappings) {
            Concept newConcept = new Concept(startConceptNumber);
            out.put(startConceptNumber, mapping);
            ontology.setConcept(newConcept);
            startConceptNumber++;
            for (Integer fromid : mapping) {
                OntologyUtilities.mergeConcepts(ontology, fromid, newConcept.getID());
            }
        }
        return out;
    }

    /**
     * Removes every clique member from the adjacency map, both as a key and from the neighbour
     * sets of the concepts it was related to.
     */
    private static void removeCliqueFromMapping(Set<Integer> mapClique, Map<Integer, Set<Integer>> mapping) {
        for (Integer id : mapClique) {
            Set<Integer> targets = mapping.remove(id);
            if (targets == null) {
                // Nothing registered for this ID; there are no back references to clean up.
                continue;
            }
            for (Integer target : targets) {
                Set<Integer> targetSet = mapping.get(target);
                if (targetSet != null) {
                    targetSet.remove(id);
                }
            }
        }
    }

    /**
     * Starts from the connected group of {@code id} and removes the weakest member until the
     * remaining group is a clique according to {@link #trueClique(Set, Map)}.
     */
    private static Set<Integer> pruneCliqueForConcept(Integer id, Map<Integer, Set<Integer>> mapping, Map<Integer, SparseVectorInt2Float> overlap, Ontology ontology) {
        Set<Integer> cli = getPotentialClique(id, mapping);
        while (!trueClique(cli, mapping)) {
            pruneClique(cli, overlap, ontology);
        }
        return cli;
    }

    /**
     * Removes from {@code cli} the member with the lowest product of pairwise scores against all
     * other members. On ties the member encountered first is removed.
     */
    private static void pruneClique(Set<Integer> cli, Map<Integer, SparseVectorInt2Float> overlapMap, Ontology ontology) {
        Integer lowest = null;
        double lowestScore = Double.MAX_VALUE;
        for (Integer id : cli) {
            double score = 1d;
            for (Integer id2 : cli) {
                if (id.equals(id2)) {
                    continue;
                }
                // The overlap of a pair is looked up in the row of the lower ID.
                Integer rowId = id > id2 ? id2 : id;
                Integer columnId = id > id2 ? id : id2;
                SparseVectorInt2Float row = overlapMap.get(rowId);
                // A missing row means no overlap was recorded for this pair.
                double overlap = row == null ? 0d : row.get(columnId);
                double inbetween = score(ontology.getConcept(id), ontology.getConcept(id2), overlap);
                if (inbetween == 0) {
                    inbetween = baseScore;
                }
                score *= inbetween;
            }
            if (score < lowestScore) {
                lowest = id;
                // Remember the best candidate so far; without this the last member always "won".
                lowestScore = score;
            }
        }
        cli.remove(lowest);
    }

    /**
     * Scores how well two concepts fit together: 1 when they should be merged, otherwise their
     * Dice coefficient, raised by the difference between the two Dice thresholds when the
     * preferred terms match (unless that would exceed 1).
     */
    private static double score(Concept concept1, Concept concept2, double overlap) {
        if (shouldBeMerged(concept1, concept2, (int) Math.round(overlap))) {
            return 1d;
        }
        double dice = 2d * overlap / (double) (concept1.getTerms().size() + concept2.getTerms().size());
        if (doPreferredTermsMatch(concept1, concept2)) {
            // Matching preferred terms lower the merge threshold, so they must raise the score
            // by the same margin.
            double bonus = minDiceForMerge - minDiceForMergeWhenPreferredTermsMatch;
            double boosted = dice + bonus;
            return boosted <= 1 ? boosted : dice;
        }
        return dice;
    }

    /**
     * Tells whether {@code cli} is a clique in {@code mapping}: every member must have a neighbour
     * set that contains all other members. Sets with fewer than two members count as cliques.
     *
     * @param cli     candidate set of concept IDs
     * @param mapping symmetric adjacency map
     * @return {@code true} if all members are pairwise related
     */
    @SuppressWarnings("unchecked")
    public static boolean trueClique(Set<Integer> cli, Map<Integer, Set<Integer>> mapping) {
        if (cli.size() <= 1) {
            return true;
        }
        for (Integer id : cli) {
            Set<Integer> neighbours = mapping.get(id);
            if (neighbours == null) {
                return false;
            }
            // Whatever is left after removing the neighbours may at most be the member itself.
            Set<Integer> remainder = SetUtilities.substraction(cli, neighbours);
            if (remainder.size() > 1 || (remainder.size() == 1 && !remainder.contains(id))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Collects all IDs that are reachable from {@code id} through the adjacency map and that have
     * an entry in the map themselves.
     *
     * @param id      concept ID to start from
     * @param mapping symmetric adjacency map
     * @return a new mutable set; empty when {@code id} has no entry in {@code mapping}
     */
    public static Set<Integer> getPotentialClique(Integer id, Map<Integer, Set<Integer>> mapping) {
        Set<Integer> result = new HashSet<Integer>();
        addNewConcepts(id, result, mapping);
        return result;
    }

    /**
     * Adds {@code id} and everything reachable from it to {@code result}. Uses an explicit stack
     * instead of recursion so that very large connected groups cannot overflow the call stack.
     */
    private static void addNewConcepts(Integer id, Set<Integer> result, Map<Integer, Set<Integer>> mapping) {
        List<Integer> pending = new ArrayList<Integer>();
        pending.add(id);
        while (!pending.isEmpty()) {
            Integer current = pending.remove(pending.size() - 1);
            if (result.contains(current)) {
                continue;
            }
            Set<Integer> neighbours = mapping.get(current);
            if (neighbours == null) {
                continue;
            }
            result.add(current);
            pending.addAll(neighbours);
        }
    }

    /**
     * Merges every group of concepts that is connected through mergeable pairs, even when not
     * all pairs within the group are mergeable themselves.
     *
     * <p>New concept IDs start at {@link #startConceptNumber}; this method does not advance that field.
     *
     * @param ontology ontology to modify in place
     * @return map from each new concept ID to the IDs that were merged into it
     */
    public static Map<Integer, Set<Integer>> greedyConceptMerge(Ontology ontology) {
        Map<Integer, SparseVectorInt2Float> overlap = getOverlapMap(ontology);
        Map<Integer, Set<Integer>> mappings = new TreeMap<Integer, Set<Integer>>();
        for (Entry<Integer, SparseVectorInt2Float> entry : overlap.entrySet()) {
            Concept concept1 = ontology.getConcept(entry.getKey());
            VectorCursor<Integer> cursor = entry.getValue().getNonzeroCursor();
            while (cursor.isValid()) {
                int hits = (int) Math.round(cursor.get());
                Concept concept2 = ontology.getConcept(cursor.dimension());
                if (shouldBeMerged(concept1, concept2, hits)) {
                    Set<Integer> group1 = mappings.get(concept1.getID());
                    if (group1 == null) {
                        group1 = new SortedIntListSet();
                        group1.add(concept1.getID());
                    }
                    Set<Integer> group2 = mappings.get(concept2.getID());
                    if (group2 == null) {
                        group2 = new SortedIntListSet();
                        group2.add(concept2.getID());
                    }
                    // Fuse both groups and let every member point at the fused set.
                    group1.addAll(group2);
                    for (Integer id : group1) {
                        mappings.put(id, group1);
                    }
                }
                cursor.next();
            }
        }

        // Members of one group share an equal set, so the HashSet keeps one set per group.
        Set<Set<Integer>> uniqueMappings = new HashSet<Set<Integer>>(mappings.values());
        Map<Integer, Set<Integer>> out = new TreeMap<Integer, Set<Integer>>();
        int currentNewCUI = startConceptNumber;
        int count = 0;
        for (Set<Integer> mapping : uniqueMappings) {
            Concept newConcept = new Concept(currentNewCUI);
            out.put(currentNewCUI, mapping);
            ontology.setConcept(newConcept);
            currentNewCUI++;
            for (Integer fromid : mapping) {
                OntologyUtilities.mergeConcepts(ontology, fromid, newConcept.getID());
            }
            count += mapping.size();
        }
        System.out.println("Greedy concept merge merged " + count + " concepts");
        return out;
    }

    /**
     * Tells whether the overlap covers all terms of at least one of the two concepts.
     *
     * @param numberOfHits number of overlapping terms
     * @return {@code true} if {@code numberOfHits} equals the term count of either concept
     */
    protected static boolean isSubset(Integer numberOfHits, Concept concept1, Concept concept2) {
        int hits = numberOfHits;
        return hits == concept1.getTerms().size() || hits == concept2.getTerms().size();
    }

    /**
     * Decides whether two concepts should be merged given their number of overlapping terms.
     * Derives the preferred-term match, the subset property and the Dice coefficient
     * ({@code 2 * hits / (terms1 + terms2)}) and delegates to
     * {@link #shouldBeMerged(boolean, boolean, double, int)}.
     */
    protected static boolean shouldBeMerged(Concept concept1, Concept concept2, int numberofhits) {
        boolean preferredTermsMatch = doPreferredTermsMatch(concept1, concept2);
        boolean subSet = isSubset(numberofhits, concept1, concept2);
        double dice = 2d * numberofhits / (double) (concept1.getTerms().size() + concept2.getTerms().size());
        return shouldBeMerged(preferredTermsMatch, subSet, dice, numberofhits);
    }

    /**
     * Applies the configured merge rules; the first matching rule wins.
     *
     * @param preferredTermsMatch whether the preferred terms of both concepts are equal
     * @param subSet              whether the overlap covers one of the concepts completely
     * @param dice                Dice coefficient of the two term lists
     * @param numberofhits        number of overlapping terms
     * @return {@code true} if any rule accepts the pair
     */
    protected static boolean shouldBeMerged(boolean preferredTermsMatch, boolean subSet, double dice, int numberofhits) {
        if (mergeIdentical && dice == 1d) {
            return true;
        }
        if (mergeWhenSubSet && subSet) {
            return true;
        }
        if (numberofhits < minimumOverlapForDiceMerge) {
            // Both remaining rules are Dice-based and need the minimum overlap.
            return false;
        }
        if (preferredTermsMatch && dice >= minDiceForMergeWhenPreferredTermsMatch) {
            return true;
        }
        return dice >= minDiceForMerge;
    }

    /**
     * Compares the preferred terms of two concepts ignoring case.
     *
     * @return {@code true} only if both concepts have a non-empty preferred term and the two are
     *         equal ignoring case
     */
    protected static boolean doPreferredTermsMatch(Concept concept1, Concept concept2) {
        String pref1 = getPreferredTerm(concept1);
        String pref2 = getPreferredTerm(concept2);
        if (pref1 == null || pref2 == null) {
            // A term without text cannot match anything.
            return false;
        }
        if (pref1.equals("") || pref2.equals("")) {
            return false;
        }
        return pref1.equalsIgnoreCase(pref2);
    }

    /**
     * Returns the text of the first term of the concept, which this class treats as the
     * preferred term.
     *
     * @return the text of the first term, or the empty string when the concept has no terms
     */
    protected static String getPreferredTerm(Concept concept) {
        List<TermStore> terms = concept.getTerms();
        if (terms.isEmpty()) {
            return "";
        }
        return terms.get(0).text;
    }
}