/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.ontology.ontologyutilities;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.OntologyFileLoader;
import org.erasmusmc.ontology.OntologyManager;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.Relation;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.ontology.ontologyutilities.OntologyUtilities;
import org.erasmusmc.utilities.StringUtilities;
import org.erasmusmc.utilities.WriteTextFile;
public class UMLSGenelistJochemToxMerger {
public static String umlsGeneChemOntologyName = "UMLS2010ABHomologeneJochemV1_1";
public static String toxGlossaryName = "toxGlossary_310811";
public static String mergedOntologyName = "UMLS2010ABHomologeneJochemToxV1_6";
public static String normCacheFileName = "/home/public/Peregrine/standardNormCache2006.bin";
public static String tempThesaurusPath = "/home/khettne/temp/overlapTestUMLStox.ontology";
public static String thesaurusPath = "/home/khettne/temp/UMLS2010ABHomologeneJochemToxV1_6.ontology";
public static WriteTextFile mergelog = new WriteTextFile("/home/khettne/temp/mergelogTox.log");
public String ontologyName;
public Integer geneVocIDLimit = 3000000; // Set to -1 to use GENE voc to
// identify gene ontology instead
// (much slower)
public Integer toxVocIDLimit = 2999000;
private List<CUImap> mappingsFromToCUI = new ArrayList<CUImap>();
List<Concept> checkForToxOverlap = new ArrayList<Concept>();
public static void main(String[] args) {
System.out.println(StringUtilities.now() + "\tLoading ontologies");
//OntologyManager manager = new OntologyManager();
//Ontology umls = manager.fetchStoreFromDatabase(umlsGeneChemOntologyName);
OntologyFileLoader loader = new OntologyFileLoader();
Ontology umls = loader.load("/home/khettne/temp/"+umlsGeneChemOntologyName+".ontology");
OntologyManager toxManager = new OntologyManager();
Ontology toxlist = toxManager.fetchStoreFromDatabase(toxGlossaryName);
new UMLSGenelistJochemToxMerger(umls, toxlist, mergedOntologyName);
}
public UMLSGenelistJochemToxMerger(Ontology umls, Ontology toxlist, String name) {
ontologyName = name;
concatenate(umls, toxlist);
System.gc();
mergedOntology = ontologyManager.fetchClient(ontologyName);
System.out.println(StringUtilities.now() + "\tEvaluating UMLS concepts");
OntologyStore tempThesaurus = findToxConceptsInUMLS();
System.gc();
checkForToxOverlap(tempThesaurus);
mergelog.close();
System.gc();
OntologyStore toMerge = ontologyManager.fetchStoreFromDatabase(ontologyName);
merge(toMerge);
OntologyFileLoader loader = new OntologyFileLoader();
loader.save((OntologyStore)toMerge, thesaurusPath);
ontologyManager.deleteOntology(ontologyName);
System.out.println(StringUtilities.now() + "\tDone");
}
private void merge(OntologyStore toMerge){
for (CUImap cuimap: mappingsFromToCUI) {
OntologyUtilities.mergeConcepts(toMerge, cuimap.from, cuimap.to);
}
}
private OntologyStore findToxConceptsInUMLS() {
OntologyStore tempThesaurus = new OntologyStore();
tempThesaurus.setName("overlapTestUMLStox");
System.gc();
// Evaluate UMLS concepts:
Iterator<Concept> conceptIterator = mergedOntology.getConceptIterator();
while (conceptIterator.hasNext()) {
Concept concept = conceptIterator.next();
//Evaluate UMLS concepts
if (concept.getID() >= toxVocIDLimit && concept.getID() < geneVocIDLimit) {
checkForToxOverlap.add(concept);
tempThesaurus.setConcept(concept);
}
else {
tempThesaurus.setConcept(concept);
} // Its umls concept, so add to tempthesaurus for overlap testing
}
OntologyFileLoader loader = new OntologyFileLoader();
loader.save(tempThesaurus,tempThesaurusPath);
return tempThesaurus;
}
private void checkForToxOverlap(Ontology tempThesaurus){
Map<Integer, Map<Integer, List<String>>> cui2cuis = fetchToxOverlap(tempThesaurus);
for (Concept concept: checkForToxOverlap) {
String report = overlapsWithTox(concept, cui2cuis);
if (report != null) {
mergelog.writeln("MERGE " + concept.getID() + " BECAUSE " + report);
}
}
}
// Check whether the string ends on a letter or number or mix
public boolean endsWithID(String name) {
int tokenstart = -1;
for (int i = name.length() - 2; i > 0; i--)
if (!Character.isLetterOrDigit(name.charAt(i))) {
tokenstart = i + 1;
break;
}
if (tokenstart != -1)
if (tokenstart == name.length() - 1 || name.substring(tokenstart, name.length()).equals(name.substring(tokenstart, name.length()).toUpperCase()))
return true;
return false;
}
// Check whether the terms of the concept overlap sufficiently with a tox concept
private String overlapsWithTox(Concept concept, Map<Integer, Map<Integer, List<String>>> cui2cuis) {
Map<Integer, List<String>> id2overlap = cui2cuis.get(concept.getID());
if (id2overlap == null)
return null;
int maxOverlap = 0;
int maxLFOverlap = 0;
int maxOverlapConcept = 0;
for (Entry<Integer, List<String>> entry: id2overlap.entrySet()) {
boolean toxVoc = false;
if (!(entry.getKey() >= toxVocIDLimit && entry.getKey() < geneVocIDLimit))
toxVoc = true;
if (toxVoc) { // it is another gene: look at overlap
int overlap = entry.getValue().size();
if (overlap >= maxOverlap) {
int lfOverlap = 0;
for (String term: entry.getValue())
if (!OntologyUtilities.isGeneSymbol(term))
lfOverlap++;
if (overlap > maxOverlap || lfOverlap > maxLFOverlap) {
maxOverlap = overlap;
maxLFOverlap = lfOverlap;
maxOverlapConcept = entry.getKey();
}
}
}
}
if (maxOverlap > 0) {
int termcount = concept.getTerms().size();
if (((maxOverlap == termcount) || (maxOverlap >= termcount / 2 && maxLFOverlap > 0)) || (maxLFOverlap == termcount) || (maxLFOverlap > 1)) {
StringBuffer report = new StringBuffer();
report.append("OVERLAP WITH CONCEPT " + maxOverlapConcept + " (");
for (String term: id2overlap.get(maxOverlapConcept)) {
report.append(term);
report.append(";");
}
report.append(")");
mappingsFromToCUI.add(new CUImap(concept.getID(), maxOverlapConcept));
return report.toString();
}
}
return null;
}
private Map<Integer, Map<Integer, List<String>>> fetchToxOverlap(Ontology tempThesaurus) {
System.out.println(StringUtilities.now() + "\tExamining overlap");
HomonymAnalyzer analyzer = new HomonymAnalyzer();
analyzer.normaliser.loadCacheBinary(normCacheFileName);
Iterator<Concept> iterator = tempThesaurus.getConceptIterator();
while (iterator.hasNext())
for (TermStore term: iterator.next().getTerms()) {
term.text = OntologyUtilities.tokenizeAndRemoveStopwordsFromString(term.text, analyzer.stopwords);
OntologyUtilities.setDefaultMatchingFlags(term);
}
analyzer.setOntology(tempThesaurus);
return analyzer.compareConcepts();
}
private void concatenate(Ontology umlsGene, Ontology toxlist) {
// Concatenate and dump into one ontology(client):
System.out.println(StringUtilities.now() + "\tConcatening ontologies");
umlsGene.setName(ontologyName);
ontologyManager.deleteOntology(ontologyName);
mergedOntology = umlsGene;
Iterator<Concept> conceptIterator = toxlist.getConceptIterator();
while (conceptIterator.hasNext()) {
Concept concept = conceptIterator.next();
if (concept.getID() < -999) {// vocabulary ID for toxlist: make sure no
// overlap with umls and Gene vocs
Concept newConcept = new Concept(concept.getID()-3000);
newConcept.setDefinition(concept.getDefinition());
newConcept.setName(concept.getName());
newConcept.setTerms(concept.getTerms());
concept = newConcept;
}
mergedOntology.setConcept(concept);
chemCUIs.add(concept.getID());
List<DatabaseID> databaseIDs = toxlist.getDatabaseIDsForConcept(concept.getID());
if (databaseIDs != null)
for (DatabaseID databaseID: databaseIDs)
mergedOntology.setDatabaseIDForConcept(concept.getID(), databaseID);
}
for (Relation relation: toxlist.getRelations()) {
if (relation.object < -999)
relation.object -= 3000;
if (relation.subject < -999)
relation.subject -= 3000;
mergedOntology.setRelation(relation);
}
System.out.println(StringUtilities.now() + "\tdumping "+mergedOntology.getName()+" in database");
ontologyManager.dumpStoreInDatabase((OntologyStore) mergedOntology);
}
private Ontology mergedOntology;
private Set<Integer> chemCUIs = new HashSet<Integer>();
private static OntologyManager ontologyManager = new OntologyManager();
private class CUImap {
int from;
int to;
public CUImap(int from, int to) {
this.from = from;
this.to = to;
}
}
}