/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package JochemBuilder.MergeOntologies; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.TreeSet; import org.erasmusmc.ids.DatabaseID; import org.erasmusmc.ontology.Concept; import org.erasmusmc.ontology.DefaultTypes; import org.erasmusmc.ontology.OntologyFileLoader; import org.erasmusmc.ontology.OntologyStore; import org.erasmusmc.ontology.Relation; import org.erasmusmc.ontology.TermStore; import org.erasmusmc.ontology.ontologyutilities.OntologyCurator; import org.erasmusmc.ontology.ontologyutilities.OntologyUtilities; import org.erasmusmc.utilities.StringUtilities; import org.erasmusmc.utilities.WriteTextFile; import JochemBuilder.SharedCurationScripts.CurateUsingManualCurationFile; import JochemBuilder.SharedCurationScripts.MoveINCHIfromDbid2Def; public class FinalizeOntology { public static int umlsSemID = -103; public static String umlsSemName = "Chemical"; public static int vocID = -3000; public static String vocName = "CHEMICAL"; public static String date = "12-01-2011"; public static String home = "/home/khettne/Projects/Jochem/"; //public static String oldOntology = home+"chem_merged_curated4_"+date+".ontology"; public static String oldOntology = home+"chem_merged_curated4_22-12-2010.ontology"; public static String newOntologyName = "Jochem_V1_5"; public static String newOntology = home+newOntologyName+".ontology"; public static String termsToRemove = "JochemTermsToRemove.txt"; public static String filterLog = home+"/Jochem_removedTerms_MartijnsRulerule"+date+".log"; public static String curatedLog = home+"/Jochem_removedTerms_"+date+".log"; public static boolean moveInchi = true; public static void main(String[] args) { WriteTextFile logfile = new WriteTextFile(filterLog); System.out.println("Loading ontology. "+StringUtilities.now()); OntologyFileLoader loader = new OntologyFileLoader(); OntologyStore oldChemOntology = new OntologyStore(); oldChemOntology = loader.load(oldOntology); OntologyStore newChemOntology = new OntologyStore(); newChemOntology.setName(newOntologyName); Concept semantictype = new Concept(umlsSemID); semantictype.setName(umlsSemName); newChemOntology.setConcept(semantictype); Concept vocabulary = new Concept(vocID); vocabulary.setName(vocName); newChemOntology.setConcept(vocabulary); System.out.println("Iterating. "+StringUtilities.now()); Iterator<Concept> conceptIterator = oldChemOntology.getConceptIterator(); //Re-enumerate ontology and remove terms based on Martijns rule Integer conceptCount = 4000000; int lineCount = 0; while (conceptIterator.hasNext()) { lineCount++; if (lineCount % 10000 == 0) System.out.println(lineCount); Concept concept = conceptIterator.next(); if (concept.getID()>0){ Concept copyConcept = new Concept(conceptCount); copyConcept.setName(concept.getName()); Iterator<TermStore> termIterator = concept.getTerms().iterator(); while (termIterator.hasNext()) { TermStore term = termIterator.next(); if (OntologyUtilities.MartijnsFilterRule(term.text, stopwordsForFiltering) || term.text.length()<OntologyUtilities.minTermSize){ termIterator.remove(); logfile.writeln(term.text + "|"+concept.getName() +"|"+ concept.getID()); } //else if (term.text.endsWith("ase") || term.text.endsWith("ASE")) // logfile.writeln(term.text + "|"+concept.getName() +"|"+ concept.getID()); // termIterator.remove(); } copyConcept.setTerms(concept.getTerms()); copyConcept.setDefinition(concept.getDefinition()); if (!copyConcept.getTerms().isEmpty()){ newChemOntology.setConcept(copyConcept); List<DatabaseID> databaseIDs = oldChemOntology.getDatabaseIDsForConcept(concept.getID()); for (DatabaseID databaseID: databaseIDs){ newChemOntology.setDatabaseIDForConcept(copyConcept.getID(), databaseID); } Relation vocRelation = new Relation(copyConcept.getID(), DefaultTypes.fromVocabulary, vocID); newChemOntology.setRelation(vocRelation); Relation semRelation = new Relation(copyConcept.getID(), DefaultTypes.isOfSemanticType, umlsSemID); newChemOntology.setRelation(semRelation); conceptCount++; } } } logfile.close(); //Remove terms based on medline frequency CurateUsingManualCurationFile curate = new CurateUsingManualCurationFile(); newChemOntology = curate.run(newChemOntology, curatedLog,termsToRemove); //Set flags OntologyCurator curator = new OntologyCurator(); curator.curateAndPrepare(newChemOntology); if (moveInchi){ MoveINCHIfromDbid2Def move = new MoveINCHIfromDbid2Def(); newChemOntology = move.run(newChemOntology); } System.out.println("Saving to ontology file. "+StringUtilities.now()); OntologyFileLoader loader2 = new OntologyFileLoader(); loader2.save(newChemOntology,newOntology); } private static Set<String> stopwordsForFiltering = getDefaultStopWordsForFiltering(); private static Set<String> getDefaultStopWordsForFiltering() { Set<String> result = new TreeSet<String>(); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(OntologyCurator.class.getResourceAsStream("DefaultStopwordsForFiltering.txt"))); try { while (bufferedReader.ready()) { result.add(bufferedReader.readLine()); } } catch (IOException e) { e.printStackTrace(); } return result; } }