/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.ontology.ontologyutilities; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.Map.Entry; import java.util.regex.Pattern; import org.erasmusmc.ids.DatabaseID; import org.erasmusmc.ontology.Concept; import org.erasmusmc.ontology.DefaultTypes; import org.erasmusmc.ontology.Ontology; import org.erasmusmc.ontology.OntologyFileLoader; import org.erasmusmc.ontology.OntologyManager; import org.erasmusmc.ontology.OntologyPSFLoader; import org.erasmusmc.ontology.OntologyStore; import org.erasmusmc.ontology.Relation; import org.erasmusmc.ontology.TermStore; import org.erasmusmc.ontology.ontologyConstructors.OCGenelistHumanV240; import org.erasmusmc.ontology.ontologyConstructors.OCHomologeneV2; import org.erasmusmc.utilities.StringUtilities; import org.erasmusmc.utilities.TextFileUtilities; public class UMLSGenelistMerger { public String ontologyName; public Integer geneVocIDLimit = 3000000; // Set to -1 to use GENE voc to // identify gene ontology instead // (much slower) public Pattern pLetterWordPattern = Pattern.compile("\\W[pP]\\d+\\W"); public Pattern recombinantPattern = Pattern.compile("^recombinant \\w+", Pattern.CASE_INSENSITIVE); public Pattern wtAllelePattern = Pattern.compile(" wt Allele$"); private List<Pattern> organisms; public static String umlsOntologyFilePath = "/home/khettne/Projects/UMLS/2010AB/UMLS2010AB_180211_medlinefilter.ontology"; public static String geneListOntologyFilePath = "/home/khettne/Projects/GeneList/GeneListHumanMouseRatV6_0.ontology"; public static String mergedOntologyName = "UMLS2010ABHomologeneV5_1"; public static String tempPath = "/home/khettne/temp/"; public static String normCacheFileName = "/home/public/Peregrine/standardNormCache2006.bin"; public static void main(String[] args) { System.out.println(StringUtilities.now() + "\tLoading ontologies"); OntologyFileLoader loader = new OntologyFileLoader(); Ontology umls = loader.load(umlsOntologyFilePath); OntologyFileLoader fileLoader = new OntologyFileLoader(); Ontology genelist = fileLoader.load(geneListOntologyFilePath); new UMLSGenelistMerger(umls, genelist, mergedOntologyName); } public UMLSGenelistMerger(Ontology umls, Ontology genelist, String name) { organisms = getOrganismPatterns(umls); ontologyName = name; concatenate(umls, genelist); System.gc(); mergedOntology = ontologyManager.fetchClient(ontologyName); List<Integer> removelist = findGenesInUMLS(); System.gc(); remove(removelist); System.out.println(StringUtilities.now() + "\tDone"); } private void remove(List<Integer> removelist) { System.out.println(StringUtilities.now() + "\tRemoving " + removelist.size() + " concepts"); for (Integer cui: removelist) { mergedOntology.removeConcept(cui); } } private List<Integer> findGenesInUMLS() { List<Integer> removelist = new ArrayList<Integer>(); List<String> logfile = new ArrayList<String>(); List<Concept> checkForOverlap = new ArrayList<Concept>(); OntologyStore tempThesaurus = new OntologyStore(); System.gc(); // Evaluate UMLS concepts: System.out.println(StringUtilities.now() + "\tEvaluating UMLS concepts"); Iterator<Concept> conceptIterator = mergedOntology.getConceptIterator(); while (conceptIterator.hasNext()) { Concept concept = conceptIterator.next(); if (geneVocIDLimit.equals(-1) || concept.getID() < geneVocIDLimit) { // Check semantic types boolean geneOrProtein = false; boolean potentialGeneOrProtein = false; for (Relation relation: mergedOntology.getRelationsForConceptAsSubject(concept.getID(), DefaultTypes.isOfSemanticType)) { if (relation.object == -116) geneOrProtein = true; if (relation.object == -28 || relation.object == -126 || relation.object == -192) potentialGeneOrProtein = true; } if (geneOrProtein) potentialGeneOrProtein = false; // if positively identified as gene, // it is no longer potential! // Check vocabularies boolean geneVoc = false; boolean HUGO = false; if (potentialGeneOrProtein || geneOrProtein) { for (Relation relation: mergedOntology.getRelationsForConceptAsSubject(concept.getID(), DefaultTypes.fromVocabulary)) { if (mergedOntology.getConcept(relation.object).getName().equals("HUGO")) HUGO = true; if (mergedOntology.getConcept(relation.object).getName().equals("GENE")) geneVoc = true; } } // Check whether concept should be removed if (geneVoc) { // ignore gene vocabulary completely tempThesaurus.setConcept(concept); } else { if (HUGO) { // Must remove it removelist.add(concept.getID()); logfile.add("REMOVE " + concept.getID() + " BECAUSE FROM HUGO (" + concept.getName() + ")"); } else if (geneOrProtein || potentialGeneOrProtein) { if (containswtAllelePattern(concept.getTerms())) { removelist.add(concept.getID()); logfile.add("REMOVE " + concept.getID() + " BECAUSE OF wtAllele pattern (" + concept.getName() + ")"); } else if (geneOrProtein && remove116(concept)) { // Sem type 116: // remove if fails // tests removelist.add(concept.getID()); logfile.add("REMOVE " + concept.getID() + " BECAUSE FROM SEM TYPE 116 (" + concept.getName() + ")"); } else { tempThesaurus.setConcept(concept); checkForOverlap.add(concept); } } } } else { tempThesaurus.setConcept(concept); } // Its geneVoc concept, so add to tempthesaurus for overlap testing } OntologyPSFLoader loader = new OntologyPSFLoader(); loader.ontology = tempThesaurus; loader.saveToPSF(tempPath+"overlapTest.psf"); // Do the check for overlap: Map<Integer, Map<Integer, List<String>>> cui2cuis = fetchOverlap(tempThesaurus); for (Concept concept: checkForOverlap) { String report = overlapsWithGene(concept, cui2cuis); if (report != null) { removelist.add(concept.getID()); logfile.add("REMOVE " + concept.getID() + " BECAUSE " + report); } } TextFileUtilities.saveToFile(logfile, tempPath+"mergelog.txt"); return removelist; } private boolean containswtAllelePattern(List<TermStore> terms) { for (TermStore ts: terms) { if (wtAllelePattern.matcher(ts.text).find()) { return true; } } return false; } // Check whether concept of semantic type 116 (protein) can be removed private boolean remove116(Concept concept) { return ( concept.getName().toLowerCase().contains(" protein, ") || StringUtilities.isPlural(concept.getName()) || containsorganism(concept.getName()) || containsPword(concept.getTerms()) || containsRecombinant(concept.getTerms())); } private boolean containsRecombinant(List<TermStore> terms) { for (TermStore term: terms) { if (recombinantPattern.matcher(term.text).find()) { return true; } } return false; } private boolean containsPword(List<TermStore> terms) { for (TermStore term: terms) { if (pLetterWordPattern.matcher(term.text).find()) { return true; } } return false; } private boolean containsorganism(String name) { for (Pattern organism: organisms) { if (organism.matcher(name).find()) return true; } return false; } private List<Pattern> getOrganismPatterns(Ontology ontology) { System.out.println("Detecting organisms names"); Set<String> organisms = new HashSet<String>(); for (Relation relation : ontology.getRelationsForConceptAsObject(-116, DefaultTypes.isOfSemanticType)){ Concept concept = ontology.getConcept(relation.subject); if (concept.getName().toLowerCase().contains(" protein, ")) { String name = concept.getName(); int i = name.toLowerCase().indexOf("protein, "); organisms.add(name.substring(i+10, name.length())); } } List<Pattern> result = new ArrayList<Pattern>(); for (String organism : organisms){ result.add(Pattern.compile("\\W" + organism + "\\W")); } System.out.println("Found " + result.size() + " organism names"); return result; } // Check whether the string ends on a letter or number or mix public boolean endsWithID(String name) { int tokenstart = -1; for (int i = name.length() - 2; i > 0; i--) if (!Character.isLetterOrDigit(name.charAt(i))) { tokenstart = i + 1; break; } if (tokenstart != -1) if (tokenstart == name.length() - 1 || name.substring(tokenstart, name.length()).equals(name.substring(tokenstart, name.length()).toUpperCase())) return true; return false; } // Check whether the terms of the concept overlap sufficiently with a gene private String overlapsWithGene(Concept concept, Map<Integer, Map<Integer, List<String>>> cui2cuis) { Map<Integer, List<String>> id2overlap = cui2cuis.get(concept.getID()); if (id2overlap == null) return null; int maxOverlap = 0; int maxLFOverlap = 0; int maxOverlapConcept = 0; for (Entry<Integer, List<String>> entry: id2overlap.entrySet()) { boolean geneVoc = false; if (geneVocIDLimit.equals(-1)) { for (Relation relation: mergedOntology.getRelationsForConceptAsSubject(entry.getKey(), DefaultTypes.fromVocabulary)) if (mergedOntology.getConcept(relation.object).getName().equals("GENE")) geneVoc = true; } else if (entry.getKey() > geneVocIDLimit) geneVoc = true; if (geneVoc) { // it is another gene: look at overlap int overlap = entry.getValue().size(); if (overlap >= maxOverlap) { int lfOverlap = 0; for (String term: entry.getValue()) if (!OntologyUtilities.isGeneSymbol(term)) lfOverlap++; if (overlap > maxOverlap || lfOverlap > maxLFOverlap) { maxOverlap = overlap; maxLFOverlap = lfOverlap; maxOverlapConcept = entry.getKey(); } } } } if (maxOverlap > 0) { int termcount = concept.getTerms().size(); if (maxOverlap >= termcount / 2 || (maxLFOverlap > 0)) { StringBuffer report = new StringBuffer(); report.append("OVERLAP WITH CONCEPT " + maxOverlapConcept + " ("); for (String term: id2overlap.get(maxOverlapConcept)) { report.append(term); report.append(";"); } report.append(")"); return report.toString(); } } return null; } private Map<Integer, Map<Integer, List<String>>> fetchOverlap(OntologyStore tempThesaurus) { System.out.println(StringUtilities.now() + "\tExamining overlap"); HomonymAnalyzer analyzer = new HomonymAnalyzer(); analyzer.normaliser.loadCacheBinary(normCacheFileName); analyzer.stopwords = HomonymAnalyzer.getDefaultStopWordsForIndexing(); analyzer.stopwords.add("human"); analyzer.stopwords.add("protein"); analyzer.stopwords.add("gene"); analyzer.stopwords.add("antigen"); analyzer.stopwords.add("product"); Iterator<Concept> iterator = tempThesaurus.getConceptIterator(); while (iterator.hasNext()) for (TermStore term: iterator.next().getTerms()) { term.text = OntologyUtilities.tokenizeAndRemoveStopwordsFromString(term.text, analyzer.stopwords); OntologyUtilities.setGeneChemMatchingFlags(term); } analyzer.setOntology(tempThesaurus); return analyzer.compareConcepts(); } private void concatenate(Ontology umls, Ontology genelist) { // Concatenate and dump into one ontology(client): System.out.println(StringUtilities.now() + "\tConcatening ontologies"); umls.setName(ontologyName); //ontologyManager.deleteOntology(ontologyName); // ontologyManager.dumpStoreInDatabase(umls); // mergedOntology = ontologyManager.fetchClient(umls.getName()); mergedOntology = umls; Iterator<Concept> conceptIterator = genelist.getConceptIterator(); while (conceptIterator.hasNext()) { Concept concept = conceptIterator.next(); if (concept.getID() < -999) {// vocabulary ID for genelist: make sure no // overlap with umls vocs Concept newConcept = new Concept(concept.getID() - 1000); newConcept.setDefinition(concept.getDefinition()); newConcept.setName(concept.getName()); newConcept.setTerms(concept.getTerms()); concept = newConcept; } mergedOntology.setConcept(concept); geneCUIs.add(concept.getID()); List<DatabaseID> databaseIDs = genelist.getDatabaseIDsForConcept(concept.getID()); if (databaseIDs != null) for (DatabaseID databaseID: databaseIDs) mergedOntology.setDatabaseIDForConcept(concept.getID(), databaseID); } for (Relation relation: genelist.getRelations()) { if (relation.object < -999) relation.object -= 1000; if (relation.subject < -999) relation.subject -= 1000; mergedOntology.setRelation(relation); } ontologyManager.dumpStoreInDatabase((OntologyStore) mergedOntology); } private Ontology mergedOntology; private Set<Integer> geneCUIs = new HashSet<Integer>(); private static OntologyManager ontologyManager = new OntologyManager(); }