/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */

package JochemBuilder.HMDB;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.DefaultTypes;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.Relation;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.ontology.ontologyutilities.OntologyUtilities;
import org.erasmusmc.utilities.ReadTextFile;

/**
 * Builds an {@link OntologyStore} of chemical concepts from a line-oriented text file.
 *
 * <p>The parser recognises lines of the form {@code "# <field>:"} as field headers. The
 * first following line that starts with neither {@code '#'} nor a space is taken as that
 * field's value. A line starting with {@code "#END_METABOCARD"} closes the current record
 * and, if the record collected at least one term, turns it into a concept.
 */
public class ChemicalsFromHMDB {
    public int generalChemicalVocID = -3000;
    public String generalChemicalVocName = "CHEMICAL";
    public int specificChemicalVocID = -3004;
    public String specificChemicalVocName = "HMDB";
    public int umlsSemID = -103;
    public String umlsSemName = "Chemical";

    /** Marker for values that carry no information and must be ignored. */
    private static final String NOT_AVAILABLE = "Not Available";
    /** Terms are only accepted when strictly shorter than this many characters. */
    private static final int TERM_LENGTH_LIMIT = 256;
    /** Maximum number of characters stored for a definition (full stop included). */
    private static final int MAX_DEFINITION_LENGTH = 1024;
    /** Identifier assigned to the first concept created; later concepts count upwards. */
    private static final int FIRST_CONCEPT_ID = 7000000;

    /**
     * Parses the given file and returns an ontology holding one concept per record that
     * yielded at least one term, plus the vocabulary and semantic-type concepts.
     *
     * @param filename path of the file to read
     * @return the populated ontology
     */
    public OntologyStore run(String filename) {
        OntologyStore ontology = new OntologyStore();

        // Register the vocabulary and semantic-type concepts every chemical is linked to.
        addNamedConcept(ontology, generalChemicalVocID, generalChemicalVocName);
        addNamedConcept(ontology, specificChemicalVocID, specificChemicalVocName);
        addNamedConcept(ontology, umlsSemID, umlsSemName);

        ReadTextFile file = new ReadTextFile(filename);
        Iterator<String> iterator = file.getIterator();

        // "Pending" flags: a header was seen and its value line is still expected.
        boolean namePending = false;
        boolean synonymsPending = false;
        boolean databaseLinkPending = false;
        boolean descriptionPending = false;

        // State of the record currently being read.
        String database = "";
        String preferredName = "";
        String definition = "";
        List<TermStore> terms = new ArrayList<TermStore>();
        List<DatabaseID> databaseIds = new ArrayList<DatabaseID>();

        int cui = FIRST_CONCEPT_ID;
        int lineCount = 0;

        while (iterator.hasNext()) {
            lineCount++;
            if (lineCount % 1000000 == 0) {
                System.out.println(lineCount);
            }
            String line = iterator.next();

            if (line.startsWith("#END_METABOCARD")) {
                // Always finalise the record, whatever value is still pending.
                if (!terms.isEmpty()) {
                    addChemicalConcept(ontology, cui++, terms, preferredName, definition, databaseIds);
                }
                // Reset everything unconditionally so that a record without usable terms
                // cannot leak its IDs, definition or pending flags into the next record.
                terms = new ArrayList<TermStore>();
                databaseIds = new ArrayList<DatabaseID>();
                definition = "";
                database = "";
                preferredName = "";
                namePending = false;
                synonymsPending = false;
                databaseLinkPending = false;
                descriptionPending = false;
                continue;
            }

            if (line.startsWith("#")) {
                // Header line: remember which kind of value is expected next.
                String headerDatabase = databaseForHeader(line);
                if (headerDatabase != null) {
                    databaseLinkPending = true;
                    database = headerDatabase;
                } else if (line.startsWith("# name:")
                        || line.startsWith("# chemical_formula:")
                        || line.startsWith("# iupac:")) {
                    namePending = true;
                } else if (line.startsWith("# description:")) {
                    descriptionPending = true;
                } else if (line.startsWith("# synonyms:")) {
                    synonymsPending = true;
                }
                continue;
            }

            if (line.startsWith(" ")) {
                // Lines with a leading space are never treated as values.
                continue;
            }

            // Value line: hand it to every field that is currently waiting for one.
            String value = line.trim();
            boolean available = !value.contains(NOT_AVAILABLE);

            if (databaseLinkPending) {
                if (available) {
                    if (!database.equals("INCH")) {
                        for (String dbId : value.split(";")) {
                            if (value.length() > 65535) {
                                System.out.println(value);
                            }
                            databaseIds.add(new DatabaseID(database, dbId));
                        }
                    } else {
                        // INCH values are stored whole, never split on ';'.
                        databaseIds.add(new DatabaseID(database, value));
                    }
                }
                databaseLinkPending = false;
            }

            if (namePending) {
                if (available && value.length() < TERM_LENGTH_LIMIT) {
                    // NOTE(review): name, chemical_formula and iupac all land here, so the
                    // last accepted one becomes the preferred name - confirm this is intended.
                    preferredName = value;
                    terms.add(new TermStore(value));
                }
                namePending = false;
            }

            if (synonymsPending) {
                if (available) {
                    for (String synonym : value.split(";")) {
                        // The length check deliberately uses the untrimmed piece, as before.
                        if (synonym.length() < TERM_LENGTH_LIMIT) {
                            terms.add(new TermStore(synonym.trim()));
                        }
                    }
                }
                synonymsPending = false;
            }

            if (descriptionPending) {
                definition = available ? value : "";
                descriptionPending = false;
            }
        }
        return ontology;
    }

    /** Creates a concept with the given id and name and stores it in the ontology. */
    private static void addNamedConcept(OntologyStore ontology, int id, String name) {
        Concept concept = new Concept(id);
        concept.setName(name);
        ontology.setConcept(concept);
    }

    /**
     * Returns the database code announced by a database-link header line, or {@code null}
     * when the line is not such a header.
     */
    private static String databaseForHeader(String line) {
        if (line.startsWith("# hmdb_id:")) {
            return "HMDB";
        }
        if (line.startsWith("# cas_number:")) {
            return "CAS";
        }
        if (line.startsWith("# chebi_id:")) {
            return "CHEB";
        }
        if (line.startsWith("# pubchem_compound_id:")) {
            return "PUBC";
        }
        if (line.startsWith("# pubchem_substance_id:")) {
            return "PUBS";
        }
        if (line.startsWith("# kegg_compound_id:")) {
            return "KEGG";
        }
        if (line.startsWith("# inchi_identifier:")) {
            return "INCH";
        }
        return null;
    }

    /**
     * Stores one chemical concept with its terms, definition, database IDs and its
     * vocabulary / semantic-type relations.
     */
    private void addChemicalConcept(OntologyStore ontology, int conceptId, List<TermStore> terms,
            String preferredName, String definition, List<DatabaseID> databaseIds) {
        Concept concept = new Concept(conceptId);

        // Put the preferred name first BEFORE normalising quotes, so that it is normalised
        // like every other term and can be recognised as a duplicate afterwards.
        if (preferredName.length() != 0) {
            terms.add(0, new TermStore(preferredName));
        }
        for (TermStore term : terms) {
            term.text = replaceLoneDoubleQuote(term.text);
        }

        concept.setTerms(terms);
        OntologyUtilities.removeDuplicateTerms(terms);

        if (definition.length() != 0) {
            concept.setDefinition(capDefinition(definition));
        }

        for (DatabaseID databaseId : databaseIds) {
            ontology.setDatabaseIDForConcept(concept.getID(), databaseId);
        }

        ontology.setConcept(concept);

        ontology.setRelation(
                new Relation(concept.getID(), DefaultTypes.fromVocabulary, generalChemicalVocID));
        ontology.setRelation(
                new Relation(concept.getID(), DefaultTypes.fromVocabulary, specificChemicalVocID));
        ontology.setRelation(
                new Relation(concept.getID(), DefaultTypes.isOfSemanticType, umlsSemID));
    }

    /**
     * Replaces a double quotation mark by a single one when the text contains exactly one
     * double quotation mark; any other text is returned unchanged.
     */
    private static String replaceLoneDoubleQuote(String text) {
        int first = text.indexOf('"');
        boolean exactlyOne = first >= 0 && text.indexOf('"', first + 1) < 0;
        return exactlyOne ? text.replace('"', '\'') : text;
    }

    /**
     * Makes the definition end with a full stop and never exceed
     * {@link #MAX_DEFINITION_LENGTH} characters. Text that is too long is cut so that the
     * closing full stop still fits.
     */
    private static String capDefinition(String definition) {
        boolean endsWithStop = definition.endsWith(".");
        boolean tooLong = definition.length() > MAX_DEFINITION_LENGTH
                || (!endsWithStop && definition.length() == MAX_DEFINITION_LENGTH);
        if (tooLong) {
            return definition.substring(0, MAX_DEFINITION_LENGTH - 1) + ".";
        }
        return endsWithStop ? definition : definition + ".";
    }
}