/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.ontology.ontologyutilities;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.Map.Entry;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.DefaultTypes;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.OntologyFileLoader;
import org.erasmusmc.ontology.OntologyManager;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.Relation;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.ontology.ontologyutilities.OntologyUtilities;
import org.erasmusmc.utilities.StringUtilities;
import org.erasmusmc.utilities.WriteTextFile;
public class UMLSGenelistJochemMerger {
public static String umlsGeneOntologyName = "UMLS2010ABHomologeneV5_1";
public static String jochemName = "Jochem_V1_5";
public static String mergedOntologyName = "UMLS2010ABHomologeneJochemV1_1";
public static String normCacheFileName = "/home/public/Peregrine/standardNormCache2006.bin";
public static String tempThesaurusPath = "/home/khettne/temp/overlapTestUMLSGeneChem_V1_1.ontology";
public static String thesaurusPath = "/home/khettne/temp/UMLS2010ABHomologeneJochemV1_1.ontology";
public static WriteTextFile removelog = new WriteTextFile("/home/khettne/temp/removelogUMLSGeneChem_V1_1.log");
public static WriteTextFile mergelog = new WriteTextFile("/home/khettne/temp/mergelogUMLSGeneChem_V1_1.log");
public String ontologyName;
public Integer geneVocIDLimit = 3000000; // Set to -1 to use GENE voc to
// identify gene ontology instead
// (much slower)
public Integer chemVocIDLimit = 4000000;
List<Integer> removelist = new ArrayList<Integer>();
private List<CUImap> mappingsFromToCUI = new ArrayList<CUImap>();
Set<Integer> chemicalSemanticTypesToRemove = getChemicalSemanticTypesToRemove();
Set<Integer> chemicalSemanticTypesToCheckForMerge = getChemicalSemanticTypesToCheckForMerge();
public static void main(String[] args) {
System.out.println(StringUtilities.now() + "\tLoading ontologies");
OntologyManager manager = new OntologyManager();
Ontology umlsGene = manager.fetchStoreFromDatabase(umlsGeneOntologyName);
OntologyManager chemManager = new OntologyManager();
Ontology chemlist = chemManager.fetchStoreFromDatabase(jochemName);
new UMLSGenelistJochemMerger(umlsGene, chemlist, mergedOntologyName);
}
public UMLSGenelistJochemMerger(Ontology umlsGene, Ontology chemlist, String name) {
ontologyName = name;
concatenate(umlsGene, chemlist);
System.gc();
mergedOntology = ontologyManager.fetchClient(ontologyName);
System.out.println(StringUtilities.now() + "\tEvaluating concepts");
findChemicalsInUMLSGenelist();
remove();
System.gc();
OntologyStore toMerge = ontologyManager.fetchStoreFromDatabase(ontologyName);
merge(toMerge);
System.gc();
OntologyFileLoader loader = new OntologyFileLoader();
loader.save((OntologyStore)toMerge, thesaurusPath);
ontologyManager.deleteOntology(ontologyName);
System.out.println(StringUtilities.now() + "\tDone");
}
private void remove() {
System.out.println(StringUtilities.now() + "\tRemoving " + removelist.size() + " concepts");
for (Integer cui: removelist) {
mergedOntology.removeConcept(cui);
}
}
private void merge(OntologyStore toMerge){
for (CUImap cuimap: mappingsFromToCUI) {
OntologyUtilities.mergeConcepts(toMerge, cuimap.from, cuimap.to);
}
}
private void findChemicalsInUMLSGenelist() {
List<Concept> checkForUMLSOverlap = new ArrayList<Concept>();
List<Concept> checkForGeneOverlap = new ArrayList<Concept>();
OntologyStore tempThesaurus = new OntologyStore();
tempThesaurus.setName("overlapTestUMLSGeneChem");
System.gc();
// Evaluate UMLS and Chemical concepts:
Iterator<Concept> conceptIterator = mergedOntology.getConceptIterator();
while (conceptIterator.hasNext()) {
Concept concept = conceptIterator.next();
//Evaluate UMLS concepts
if (concept.getID() < geneVocIDLimit) {
// Check semantic types
boolean potentialChemical = false;
for (Relation relation: mergedOntology.getRelationsForConceptAsSubject(concept.getID(), DefaultTypes.isOfSemanticType)) {
if (chemicalSemanticTypesToRemove.contains(relation.object)){
removelist.add(concept.getID());
} else if (chemicalSemanticTypesToCheckForMerge.contains(relation.object)){
potentialChemical = true;
}
}
if (potentialChemical) {
tempThesaurus.setConcept(concept);
checkForUMLSOverlap.add(concept);
}
}
// Evaluate chemical concepts
else if (concept.getID() >= chemVocIDLimit){
if (concept.getName().toLowerCase().contains(" protein, ")){
removelist.add(concept.getID());
removelog.writeln("REMOVE " + concept.getID() + " BECAUSE OF protein pattern (" + concept.getName() + ")");
} else {
tempThesaurus.setConcept(concept);
checkForGeneOverlap.add(concept);
}
}
else {
tempThesaurus.setConcept(concept);
} // Its gene concept, so add to tempthesaurus for overlap testing
}
OntologyFileLoader loader = new OntologyFileLoader();
loader.save(tempThesaurus, tempThesaurusPath);
Map<Integer, Map<Integer, List<String>>> cui2cuis = fetchUMLSChemOverlap(tempThesaurus);
for (Concept concept: checkForUMLSOverlap) {
String report = overlapsWithUMLS(concept, cui2cuis);
if (report != null) {
mergelog.writeln("MERGE " + concept.getID() + " BECAUSE " + report);
}
}
Map<Integer, Map<Integer, List<String>>> cuiToCuis = fetchGeneChemOverlap(tempThesaurus);
for (Concept concept: checkForGeneOverlap) {
String report = overlapsWithGene(concept, cuiToCuis);
if (report != null) {
mergelog.writeln("MERGE " + concept.getID() + " BECAUSE " + report);
}
}
removelog.close();
mergelog.close();
}
/** private void checkForUMLSOverlap(Ontology tempThesaurus){
Map<Integer, Map<Integer, List<String>>> cui2cuis = fetchUMLSChemOverlap(tempThesaurus);
for (Concept concept: checkForUMLSOverlap) {
String report = overlapsWithUMLS(concept, cui2cuis);
if (report != null) {
mergelog.writeln("MERGE " + concept.getID() + " BECAUSE " + report);
}
}
}*/
/** private void checkForGeneOverlap(Ontology tempThesaurus){
Map<Integer, Map<Integer, List<String>>> cui2cuis = fetchGeneChemOverlap(tempThesaurus);
for (Concept concept: checkForGeneOverlap) {
String report = overlapsWithGene(concept, cui2cuis);
if (report != null) {
mergelog.writeln("MERGE " + concept.getID() + " BECAUSE " + report);
}
}
}*/
// Check whether the string ends on a letter or number or mix
public boolean endsWithID(String name) {
int tokenstart = -1;
for (int i = name.length() - 2; i > 0; i--)
if (!Character.isLetterOrDigit(name.charAt(i))) {
tokenstart = i + 1;
break;
}
if (tokenstart != -1)
if (tokenstart == name.length() - 1 || name.substring(tokenstart, name.length()).equals(name.substring(tokenstart, name.length()).toUpperCase()))
return true;
return false;
}
// Check whether the terms of the concept overlap sufficiently with a umls chemical
private String overlapsWithUMLS(Concept concept, Map<Integer, Map<Integer, List<String>>> cui2cuis) {
Map<Integer, List<String>> id2overlap = cui2cuis.get(concept.getID());
if (id2overlap == null)
return null;
int maxOverlap = 0;
int maxLFOverlap = 0;
int maxOverlapConcept = 0;
for (Entry<Integer, List<String>> entry: id2overlap.entrySet()) {
boolean chemVoc = false;
if (entry.getKey() > chemVocIDLimit)
chemVoc = true;
if (chemVoc) { // it is another chemical: look at overlap
int overlap = entry.getValue().size();
if (overlap >= maxOverlap) {
int lfOverlap = 0;
for (String term: entry.getValue())
if (!OntologyUtilities.isGeneSymbol(term))
lfOverlap++;
if (overlap > maxOverlap || lfOverlap > maxLFOverlap) {
maxOverlap = overlap;
maxLFOverlap = lfOverlap;
maxOverlapConcept = entry.getKey();
}
}
}
}
if (maxOverlap > 0) {
int termcount = concept.getTerms().size();
if (((maxOverlap == termcount) || (maxOverlap >= termcount / 2 && maxLFOverlap > 0)) || (maxLFOverlap == termcount) || (maxLFOverlap > 1)) {
StringBuffer report = new StringBuffer();
report.append("OVERLAP WITH CONCEPT " + maxOverlapConcept + " (");
for (String term: id2overlap.get(maxOverlapConcept)) {
report.append(term);
report.append(";");
}
report.append(")");
mappingsFromToCUI.add(new CUImap(concept.getID(), maxOverlapConcept));
return report.toString();
}
}
return null;
}
// Check whether the terms of the concept overlap sufficiently with a gene
private String overlapsWithGene(Concept concept, Map<Integer, Map<Integer, List<String>>> cui2cuis) {
Map<Integer, List<String>> id2overlap = cui2cuis.get(concept.getID());
if (id2overlap == null)
return null;
int maxOverlap = 0;
int maxLFOverlap = 0;
int maxOverlapConcept = 0;
for (Entry<Integer, List<String>> entry: id2overlap.entrySet()) {
boolean geneVoc = false;
if (entry.getKey() >= geneVocIDLimit && entry.getKey() < chemVocIDLimit)
geneVoc = true;
if (geneVoc) { // it is another gene: look at overlap
int overlap = entry.getValue().size();
if (overlap >= maxOverlap) {
int lfOverlap = 0;
for (String term: entry.getValue())
if (!OntologyUtilities.isGeneSymbol(term))
lfOverlap++;
if (overlap > maxOverlap || lfOverlap > maxLFOverlap) {
maxOverlap = overlap;
maxLFOverlap = lfOverlap;
maxOverlapConcept = entry.getKey();
}
}
}
}
if (maxOverlap > 0) {
int termcount = concept.getTerms().size();
if (((maxOverlap == termcount) || (maxOverlap >= termcount / 2 && maxLFOverlap > 0)) || (maxLFOverlap == termcount) || (maxLFOverlap > 1)) {
StringBuffer report = new StringBuffer();
report.append("OVERLAP WITH CONCEPT " + maxOverlapConcept + " (");
for (String term: id2overlap.get(maxOverlapConcept)) {
report.append(term);
report.append(";");
}
report.append(")");
mappingsFromToCUI.add(new CUImap(concept.getID(), maxOverlapConcept));
return report.toString();
}
}
return null;
}
private Map<Integer, Map<Integer, List<String>>> fetchUMLSChemOverlap(Ontology tempThesaurus) {
System.out.println(StringUtilities.now() + "\tExamining overlap");
HomonymAnalyzer analyzer = new HomonymAnalyzer();
analyzer.normaliser.loadCacheBinary(normCacheFileName);
Iterator<Concept> iterator = tempThesaurus.getConceptIterator();
while (iterator.hasNext())
for (TermStore term: iterator.next().getTerms()) {
term.text = OntologyUtilities.tokenizeAndRemoveStopwordsFromString(term.text, analyzer.stopwords);
OntologyUtilities.setGeneChemMatchingFlags(term);
}
analyzer.setOntology(tempThesaurus);
return analyzer.compareConcepts();
}
private Map<Integer, Map<Integer, List<String>>> fetchGeneChemOverlap(Ontology tempThesaurus) {
System.out.println(StringUtilities.now() + "\tExamining overlap");
HomonymAnalyzer analyzer = new HomonymAnalyzer();
analyzer.normaliser.loadCacheBinary(normCacheFileName);
analyzer.stopwords = HomonymAnalyzer.getDefaultStopWordsForIndexing();
analyzer.stopwords.add("human");
analyzer.stopwords.add("protein");
analyzer.stopwords.add("gene");
analyzer.stopwords.add("antigen");
analyzer.stopwords.add("product");
Iterator<Concept> iterator = tempThesaurus.getConceptIterator();
while (iterator.hasNext())
for (TermStore term: iterator.next().getTerms()) {
term.text = OntologyUtilities.tokenizeAndRemoveStopwordsFromString(term.text, analyzer.stopwords);
OntologyUtilities.setGeneChemMatchingFlags(term);
}
analyzer.setOntology(tempThesaurus);
return analyzer.compareConcepts();
}
private void concatenate(Ontology umlsGene, Ontology chemlist) {
// Concatenate and dump into one ontology(client):
System.out.println(StringUtilities.now() + "\tConcatening ontologies");
umlsGene.setName(ontologyName);
ontologyManager.deleteOntology(ontologyName);
mergedOntology = umlsGene;
Iterator<Concept> conceptIterator = chemlist.getConceptIterator();
while (conceptIterator.hasNext()) {
Concept concept = conceptIterator.next();
if (concept.getID() < -999) {// vocabulary ID for chemlist: make sure no
// overlap with umls and Gene vocs
Concept newConcept = new Concept(concept.getID()-2000);
newConcept.setDefinition(concept.getDefinition());
newConcept.setName(concept.getName());
newConcept.setTerms(concept.getTerms());
concept = newConcept;
}
mergedOntology.setConcept(concept);
chemCUIs.add(concept.getID());
List<DatabaseID> databaseIDs = chemlist.getDatabaseIDsForConcept(concept.getID());
if (databaseIDs != null)
for (DatabaseID databaseID: databaseIDs)
mergedOntology.setDatabaseIDForConcept(concept.getID(), databaseID);
}
for (Relation relation: chemlist.getRelations()) {
if (relation.object < -999)
relation.object -= 2000;
if (relation.subject < -999)
relation.subject -= 2000;
mergedOntology.setRelation(relation);
}
System.out.println(StringUtilities.now() + "\tdumping "+mergedOntology.getName()+" in database");
ontologyManager.dumpStoreInDatabase((OntologyStore) mergedOntology);
}
private Ontology mergedOntology;
private Set<Integer> chemCUIs = new HashSet<Integer>();
private static OntologyManager ontologyManager = new OntologyManager();
private static Set<Integer> getChemicalSemanticTypesToRemove() {
Set<Integer> result = new TreeSet<Integer>();
result.add(-103); //Chemical
result.add(-104); //Chemical viewed structurally
result.add(-109); //Organic chemical
result.add(-114); //Organophosphorous compound
result.add(-115); //Nucleic acid, nucleoside or nucleotide
result.add(-118); //Carbohydrate
result.add(-119); //Lipid
result.add(-110); //Steroid
result.add(-111); //Eicosanoid
result.add(-196); //Element, ion or isotope
result.add(-197); //Inorganic chemical
result.add(-120); //Chemical viewed functionally
result.add(-121); //Pharmacologic substance
result.add(-195); //Antibiotic
result.add(-122); //Biomedical or dental material
result.add(-123); //Biologically active substance
result.add(-124); //Neuroreactive substance or biogenic amine
result.add(-125); //Hormone
result.add(-127); //Vitamin
result.add(-129); //Immunologic factor
result.add(-130); //Indicator, reagent, or diagnostic aid
result.add(-131); //Hazardous or poisonous substance
result.add(-200); //Clinical drug
return result;
}
private static Set<Integer> getChemicalSemanticTypesToCheckForMerge() {
Set<Integer> result = new TreeSet<Integer>();
result.add(-116); //Amino acid, peptide or protein
result.add(-126); //Enzyme
result.add(-192); //Receptor
return result;
}
private class CUImap {
int from;
int to;
public CUImap(int from, int to) {
this.from = from;
this.to = to;
}
}
}