/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.dataimport.UMLS; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import org.erasmusmc.medline.MedlineIterator; import org.erasmusmc.medline.MedlineListener; import org.erasmusmc.medline.MedlineRecord; import org.erasmusmc.ontology.Concept; import org.erasmusmc.ontology.Ontology; import org.erasmusmc.ontology.OntologyFileLoader; import org.erasmusmc.ontology.OntologyManager; import org.erasmusmc.ontology.OntologyStore; import org.erasmusmc.ontology.Relation; import org.erasmusmc.ontology.TermStore; import org.erasmusmc.peregrine.ConceptPeregrine; import org.erasmusmc.peregrine.ReleasedTerm; import org.erasmusmc.peregrine.ResultTerm; import org.erasmusmc.utilities.ReadTextFile; import org.erasmusmc.utilities.StringUtilities; import org.erasmusmc.utilities.WriteTextFile; public class UMLSMedlineFilter implements MedlineListener{ public String pmidFile = "/home/khettne/Public/PMIDs/all_2010.PMIDs"; public String tempFolder = "/home/khettne/temp/"; private ConceptPeregrine peregrine; private Map<Integer, Set<Integer>> cui2termids; public static String ontologySourceName = "/home/khettne/Projects/UMLS/2010AB/UMLS2010AB_180211.ontology"; public static String ontologyTargetOutputFile = "/home/khettne/Projects/UMLS/2010AB/UMLS2010AB_180211_medlinefilter.ontology"; /** * Filters a thesaurus so only terms found at least once in Medline remain * @param args */ public static void main(String[] args) { UMLSMedlineFilter filter = new UMLSMedlineFilter(); filter.filter(ontologySourceName,ontologyTargetOutputFile); } private void filter(String source, String target) { initPeregrine(loadOntologyForIndexing(source)); cui2termids = new HashMap<Integer, Set<Integer>>(); System.out.println(StringUtilities.now() + "\tIndexing medline"); MedlineIterator medlineIterator = new MedlineIterator(); medlineIterator.pmidsFile = pmidFile; medlineIterator.iterate(this); saveResults(); peregrine = null; System.gc(); loadResults(); filterOntology(loadOntologyForFiltering(source), target); System.out.println(StringUtilities.now() + "\tDone"); } private void filterOntology(OntologyStore ontology, String target) { System.out.println(StringUtilities.now() + "\tFiltering ontology"); OntologyStore removedOntology = new OntologyStore(); Iterator<Concept> iterator = ontology.iterator(); int origConcepts = 0; int origTerms = 0; int newConcepts = 0; int newTerms = 0; while (iterator.hasNext()){ Concept concept = iterator.next(); if (concept.getID() < 0) removedOntology.setConcept(concept); origConcepts++; origTerms += concept.getTerms().size(); Set<Integer> termIDs = cui2termids.get(concept.getID()); if (termIDs == null && concept.getID() > 0){ copyConcept(ontology, removedOntology, concept); iterator.remove(); } else { newConcepts++; if (newConcepts % 10000 == 0) System.out.println(newConcepts + " new concepts"); List<TermStore> terms = concept.getTerms(); List<TermStore> newList = new ArrayList<TermStore>(); List<TermStore> deletedList = new ArrayList<TermStore>(); for (int i = 0; i < terms.size(); i++) if (termIDs.contains(i)) newList.add(terms.get(i)); else deletedList.add(terms.get(i)); concept.setTerms(newList); newTerms += newList.size(); if (deletedList.size() != 0){ Concept copyConcept = copyConcept(ontology, removedOntology, concept); copyConcept.setTerms(deletedList); } } } OntologyFileLoader loader = new OntologyFileLoader(); System.out.println("Saving ontology"); loader.save(ontology,target); loader.save(removedOntology,tempFolder + "/umlsMedlineFilter_removedTerms.ontology"); System.out.println("Original: " + origConcepts + " concepts, " + origTerms + " terms"); System.out.println("Filtered: " + newConcepts + " concepts, " + newTerms + " terms"); } private Concept copyConcept(OntologyStore ontology, OntologyStore removedOntology, Concept concept) { Concept copy = new Concept(concept.getID()); List<TermStore> terms = new ArrayList<TermStore>(concept.getTerms()); copy.setTerms(terms); removedOntology.setConcept(copy); for (Relation relation : ontology.getRelationsForConceptAsSubject(concept.getID())) removedOntology.setRelation(relation); return copy; } private void saveResults() { System.out.println(StringUtilities.now() + "\tSaving results"); WriteTextFile out = new WriteTextFile(tempFolder +"umlsMedlineFilter_termIDs.txt"); for (Map.Entry<Integer, Set<Integer>> entry : cui2termids.entrySet()){ StringBuilder line = new StringBuilder(); line.append(entry.getKey()); line.append("\t"); line.append(StringUtilities.join(entry.getValue(), ";")); out.writeln(line.toString()); } out.close(); } private void loadResults() { cui2termids = new HashMap<Integer, Set<Integer>>(); ReadTextFile in = new ReadTextFile(tempFolder +"umlsMedlineFilter_termIDs.txt"); for (String line : in){ String[] cols = line.split("\t"); int cui = Integer.parseInt(cols[0]); String[] tids = cols[1].split(";"); Set<Integer> termIDs = new HashSet<Integer>(); for (String tid : tids) termIDs.add(Integer.parseInt(tid)); cui2termids.put(cui, termIDs); } } private Ontology loadOntologyForIndexing(String source){ Ontology ontology; if (source.toLowerCase().endsWith(".ontology")){ OntologyFileLoader loader = new OntologyFileLoader(); loader.setLoadTermsOnly(true); ontology = loader.load(source); } else { OntologyManager manager = new OntologyManager(); ontology = manager.fetchClient(source); } return ontology; } private OntologyStore loadOntologyForFiltering(String source){ OntologyStore ontology; if (source.toLowerCase().endsWith(".ontology")){ OntologyFileLoader loader = new OntologyFileLoader(); ontology = loader.load(source); } else { OntologyManager manager = new OntologyManager(); ontology = manager.fetchStoreFromDatabase(source); } return ontology; } private void initPeregrine(Ontology ontology) { System.out.println(StringUtilities.now() + "\tLoading ontology"); peregrine = new ConceptPeregrine(); peregrine.normaliser.loadCacheBinary("/home/public/Peregrine/standardNormCache2006.bin"); System.out.println("Normalizer cache size = " + peregrine.normaliser.getCacheSize()); peregrine.setOntology(ontology); peregrine.destroyOntologyDuringRelease = true; System.out.println(StringUtilities.now() + "\tReleasing ontology"); peregrine.release(); } @Override public void processMedlineRecords(List<MedlineRecord> records) { for (MedlineRecord record : records){ peregrine.index(record.titleAbsMesh()); for (ResultTerm resultTerm : peregrine.resultTerms){ ReleasedTerm releasedTerm = resultTerm.term; for (int i = 0; i < releasedTerm.conceptId.length; i++){ Set<Integer> termIDs = cui2termids.get(releasedTerm.conceptId[i]); if (termIDs == null){ termIDs = new HashSet<Integer>(1); cui2termids.put(releasedTerm.conceptId[i], termIDs); } termIDs.add(releasedTerm.termId[i]); } } } } }