/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.dataimport.UMLS; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.TreeSet; import java.util.regex.Pattern; import org.erasmusmc.ids.DatabaseID; import org.erasmusmc.ontology.Concept; import org.erasmusmc.ontology.DefaultTypes; import org.erasmusmc.ontology.Ontology; import org.erasmusmc.ontology.OntologyPSFLoader; import org.erasmusmc.ontology.OntologyStore; import org.erasmusmc.ontology.Relation; import org.erasmusmc.ontology.TermStore; import org.erasmusmc.ontology.ontologyutilities.OntologyCurator; import org.erasmusmc.ontology.ontologyutilities.OntologyUtilities; import org.erasmusmc.utilities.StringUtilities; import org.erasmusmc.utilities.TextFileUtilities; public class UMLSFilteringAfterOntologyCreation { public static int MESHVOC = -1000; public static int maxtermsize = 100; public static Pattern Retiredpattern = Pattern.compile("retired code", Pattern.CASE_INSENSITIVE); public static Pattern CurlyParenthesispattern = Pattern.compile("\\{.*\\}"); public static Pattern SquarebracketsCodingPattern = Pattern.compile("^\\[\\w{2}\\d{3}\\]"); public static Pattern AtsignPattern = Pattern.compile("@"); public static Pattern AtsignGeneClusterPattern = Pattern.compile("@ gene cluster"); public static Pattern NonEssentialParentheticals = Pattern.compile("(\\[X\\])|(\\[V\\])|\\[D\\]|\\[M\\]|\\[EDTA\\]|\\[SO\\]|\\[Q\\]"); public static Pattern DisorderPattern = Pattern.compile("\\(disorder\\)"); public static Pattern FindingPattern = Pattern.compile("\\(finding\\)"); public static Pattern xxxPattern = Pattern.compile("xxx", Pattern.CASE_INSENSITIVE); public static Pattern ECPattern = Pattern.compile("^EC\\s[0-9]+\\.", Pattern.CASE_INSENSITIVE); public static Pattern proteinWeightPattern = Pattern.compile("^[0-9]+ ?[kK][dD][aA]?$"); /**Patterns from UMLS rewrite rules project:*/ public static Pattern DosagePattern = Pattern.compile("(\\s\\d[\\d.]*\\s?((g )|(ug)|(mg)|(ml)|%)|(\\(ml\\))|(\\(mg\\))|(\\(gm\\))|(\\(ug\\)))", Pattern.CASE_INSENSITIVE); public static Pattern necPatternCombined = Pattern.compile("(,\\snec$)|(\\s\\(nec\\)$)|(\\s\\[nec\\]$)|(not elsewhere classified)|(unclassified)|(without mention)", Pattern.CASE_INSENSITIVE); public static Pattern nosPatternCombined = Pattern.compile("(,\\snos$)|(\\s\\(nos\\)$)|(\\s\\[nos\\]$)|(not otherwise specified)|(not specified)|(unspecified)", Pattern.CASE_INSENSITIVE); public static Pattern miscPatternCombined = Pattern.compile("(^|\\s)other(\\s|$)|(deprecated)|(unknown)|(obsolete)|(^no\\s+)|(miscellaneous)|(\\(MMHCC\\))", Pattern.CASE_INSENSITIVE); public static Set<Integer> filteredSemanticTypes = getSemanticTypesForFiltering(); public static Set<Integer> filteredSemanticTypesNotMesh = getSemanticTypesForFilteringNotMesh(); public static Set<String> stopwordsForFiltering = OntologyUtilities.stopwordsForFiltering; public static void main(String[] args) { String nameOfNewDatabase = "UMLS2006AD_AfterFiltering_031207"; // String nameOfNewDatabase = "test_AfterFiltering"; // Create log List<String> log_output = new ArrayList<String>(); // String logname = "/home/khettne/Toxicogenomics/Data/Indexing/UMLS_thesaurus_building/UMLS_filtering_log.log"; String logname = "/home/public/Thesauri/UMLS2006AD/UMLS_filtering_log_031207.log"; System.out.println("Loading original ontology " + StringUtilities.now()); OntologyPSFLoader loader = new OntologyPSFLoader(); //loader.loadFromPSF("/home/khettne/Toxicogenomics/Data/Indexing/UMLS_thesaurus_building/UMLS_2006AD_beforefiltering.psf"); loader.loadFromPSF("/home/public/Thesauri/UMLS2006AD/UMLS_2006AD_beforefiltering.psf"); //loader.loadFromPSF("/home/public/Thesauri/UMLS2006AD/test.psf"); OntologyStore originalOntology = loader.ontology; System.out.println("Creating new ontology " + StringUtilities.now()); Ontology newOntology = new OntologyStore(); newOntology.setName(nameOfNewDatabase); Set<Integer> includedCUIs = new HashSet<Integer>(); System.out.println("Checking rules " + StringUtilities.now()); Iterator<Concept> conceptIterator = originalOntology.getConceptIterator(); int lineCount = 0; while (conceptIterator.hasNext()) { lineCount++; if (lineCount % 1000 == 0) System.out.println(lineCount); Concept concept = conceptIterator.next(); if (concept.getID() > 0) { if (semanticFilter(concept, originalOntology)) { log_output.add("FILTERED OUT DUE TO BAD SEMANTIC TYPE|" + concept.getName() + "|" + concept.getID()); concept.setTerms(new ArrayList<TermStore>()); } /*if (VOCFilter(concept, originalOntology)) { log_output.add("FILTERED OUT DUE TO BAD VOC|" + concept.getName() + "|" + concept.getID()); concept.setTerms(new ArrayList<TermStore>()); }*/ Iterator<TermStore> termIterator = concept.getTerms().iterator(); while (termIterator.hasNext()) { TermStore term = termIterator.next(); // Check if the term satisfies Martijns rule or the other filter rules, and remove it if it does if (OntologyUtilities.MartijnsFilterRule(term.text, stopwordsForFiltering)) { termIterator.remove(); log_output.add("SATISFIES MARTIJNS RULE|" + term.text + "|" + concept.getID()); } else if (satisfiesUMLSfilterRules(concept.getID(), originalOntology, term)) { termIterator.remove(); log_output.add("FILTERED OUT DUE TO PATTERN MATCHING|" + term.text + "|" + concept.getID()); } } OntologyUtilities.removeDuplicateTerms(concept.getTerms()); } if (!concept.getTerms().isEmpty() || concept.getID() < 0) { includedCUIs.add(concept.getID()); newOntology.setConcept(concept); } } // Copy relationships: List<Relation> relations = originalOntology.getRelations(); for (Relation relation: relations) if (includedCUIs.contains(relation.subject)&& includedCUIs.contains(relation.object)) newOntology.setRelation(relation); // Copy databaseIDs: List<DatabaseID> databaseIDs; for (int cui: includedCUIs) { databaseIDs = originalOntology.getDatabaseIDsForConcept(cui); if (databaseIDs != null) for (DatabaseID databaseID: databaseIDs) newOntology.setDatabaseIDForConcept(cui, databaseID); } System.out.println("Writing logfile " + StringUtilities.now()); TextFileUtilities.saveToFile(log_output, logname); OntologyCurator curator = new OntologyCurator("/home/public/thesauri/UMLS2006AD/UMLS_curation_file.txt"); curator.curateAndPrepare(newOntology); loader.ontology = (OntologyStore) newOntology; //loader.SaveToPSF("/home/khettne/Toxicogenomics/Data/Indexing/UMLS_thesaurus_building/UMLS_2006AD_filtered.psf"); loader.saveToPSF("/home/public/Thesauri/UMLS2006AD/UMLS_2006AD_filtered_031207.psf"); } /*private static boolean VOCFilter(Concept concept, OntologyStore originalOntology) { return originalOntology.existsRelation(new Relation(concept.getID(), DefaultTypes.fromVocabulary, NCICTCAEVOC)); }*/ private static boolean semanticFilter(Concept concept, OntologyStore originalOntology) { List<Relation> relations = originalOntology.getRelationsForConceptAsSubject(concept.getID(), DefaultTypes.isOfSemanticType); boolean isMESH = originalOntology.existsRelation(new Relation(concept.getID(), DefaultTypes.fromVocabulary, MESHVOC)); for (Relation relation: relations) { boolean filter = false; if (filteredSemanticTypes.contains(relation.object)) { filter = true; } else if (!isMESH && filteredSemanticTypesNotMesh.contains(relation.object)) { filter = true; } if (!filter) { return false; } } return true; } private static Set<Integer> getSemanticTypesForFiltering() { Set<Integer> result = new TreeSet<Integer>(); result.add(-71); result.add(-185); result.add(-78); result.add(-171); result.add(-122); return result; } private static Set<Integer> getSemanticTypesForFilteringNotMesh() { Set<Integer> result = new TreeSet<Integer>(); result.add(-201); result.add(-200); result.add(-170); result.add(-97); result.add(-73); result.add(-74); result.add(-203); result.add(-79); result.add(-80); result.add(-81); result.add(-82); result.add(-83); result.add(-169); result.add(-77); result.add(-92); result.add(-93); result.add(-94); return result; } public static boolean satisfiesUMLSfilterRules(Integer conceptID, Ontology ontology, TermStore term) { String t = term.text; if (t.length() > maxtermsize && !(OntologyUtilities.isChemical(conceptID, ontology))) return true; if (DosagePattern.matcher(t).find()) return true; if (necPatternCombined.matcher(t).find()) return true; if (nosPatternCombined.matcher(t).find()) return true; if (miscPatternCombined.matcher(t).find()) return true; if (CurlyParenthesispattern.matcher(t).find() && !(OntologyUtilities.isChemical(conceptID, ontology))) return true; if (AtsignPattern.matcher(t).find() && !(AtsignGeneClusterPattern.matcher(t).find())) return true; if (Retiredpattern.matcher(t).find()) return true; if (xxxPattern.matcher(t).find()) return true; if (ECPattern.matcher(t).find()) return true; if (proteinWeightPattern.matcher(t).matches()) return true; return false; } }