/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package JochemBuilder.ChEBI;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.DefaultTypes;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.Relation;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.ontology.ontologyutilities.OntologyUtilities;
import org.erasmusmc.utilities.ReadTextFile;
/**
 * Builds an {@link OntologyStore} of chemical concepts from a ChEBI export that uses
 * OBO-style tag lines ({@code id:}, {@code name:}, {@code def:}, {@code synonym:},
 * {@code xref:}) grouped into stanzas separated by empty lines.
 *
 * <p>Every stored concept is linked to the general and the specific vocabulary concept and
 * to the semantic type concept configured through the public fields of this class.
 */
public class ChemicalsFromChEBI {
  /** Names and synonyms of this length or longer are not stored. */
  private static final int TERM_LENGTH_LIMIT = 255;

  /** Maximum number of characters of a stored definition, including the final full stop. */
  private static final int MAX_DEFINITION_LENGTH = 1024;

  /** Concept ID given to the first concept that is created; later concepts count up from it. */
  private static final int FIRST_CONCEPT_ID = 9000000;

  /** The current line number is printed to standard output every this many lines. */
  private static final int PROGRESS_INTERVAL = 10000;

  /**
   * ChEBI identifiers for which no concept is created.
   * NOTE(review): presumably very generic top-level entries - confirm against ChEBI.
   */
  private static final String[] EXCLUDED_CHEBI_IDS = {"23091", "24431", "23367"};

  /**
   * Cross-reference rules as {source database, quoted marker, stored database code}.
   * An xref is stored under the code of the first rule whose source database equals the
   * part before the colon and whose marker occurs in the part after the colon.
   */
  private static final String[][] XREF_RULES = {
    {"KEGG COMPOUND", "CAS Registry Number", "CAS"},
    {"KEGG COMPOUND", "KEGG COMPOUND", "KEGG"},
    {"KEGG DRUG", "CAS Registry Number", "CAS"},
    {"KEGG DRUG", "KEGG DRUG", "KEGG"},
    {"ChemIDplus", "CAS Registry Number", "CAS"},
    {"NIST Chemistry WebBook", "CAS Registry Number", "CAS"},
    {"ChEBI", "KEGG COMPOUND", "KEGG"},
    {"DrugBank", "DrugBank", "DRUG"},
  };

  /** Concept ID of the general vocabulary concept every chemical is linked to. */
  public int generalChemicalVocID = -3000;
  /** Name of the general vocabulary concept. */
  public String generalChemicalVocName = "CHEMICAL";
  /** Concept ID of the ChEBI-specific vocabulary concept every chemical is linked to. */
  public int specificChemicalVocID = -3002;
  /** Name of the ChEBI-specific vocabulary concept. */
  public String specificChemicalVocName = "CHEBI";
  /** Concept ID of the semantic type concept every chemical is linked to. */
  public int umlsSemID = -103;
  /** Name of the semantic type concept. */
  public String umlsSemName = "Chemical";

  /** Data collected for the stanza that is currently being read. */
  private static final class Stanza {
    String chebiID = "";
    String definition = "";
    List<TermStore> terms = new ArrayList<TermStore>();
    List<DatabaseID> databaseIds = new ArrayList<DatabaseID>();
  }

  /**
   * Reads the given file and converts its stanzas into concepts.
   *
   * <p>A stanza is stored when an empty line (or the end of the input) is reached, provided
   * it has at least one term and its identifier is not excluded. Reading stops at the first
   * line starting with {@code [Typedef]}. All per-stanza state is discarded at every empty
   * line, so nothing carries over from a skipped stanza into the next one.
   *
   * @param filename path of the ChEBI file to read
   * @return the ontology holding the vocabulary, semantic type and chemical concepts
   */
  public OntologyStore run(String filename){
    OntologyStore ontology = new OntologyStore();
    //Set the standard vocabulary and umls semantic type concepts
    addNamedConcept(ontology, generalChemicalVocID, generalChemicalVocName);
    addNamedConcept(ontology, specificChemicalVocID, specificChemicalVocName);
    addNamedConcept(ontology, umlsSemID, umlsSemName);

    ReadTextFile file = new ReadTextFile(filename);
    Iterator<String> iterator = file.getIterator();
    Stanza stanza = new Stanza();
    int cui = FIRST_CONCEPT_ID;
    int lineCount = 0;
    while (iterator.hasNext()) {
      lineCount++;
      if (lineCount % PROGRESS_INTERVAL == 0) {
        System.out.println(lineCount);
      }
      String line = iterator.next();
      if (line.startsWith("[Typedef]")) {
        break;
      }
      if (line.length() == 0) {
        // End of stanza: store it if eligible, then always start from a clean state.
        if (storeConcept(ontology, cui, stanza)) {
          cui++;
        }
        stanza = new Stanza();
      } else if (line.startsWith("id:")) {
        stanza.chebiID = parseId(line);
      } else if (line.startsWith("name:")) {
        // The name starts a fresh term list, so it is always the first term of the concept.
        stanza.terms = new ArrayList<TermStore>();
        addTerm(stanza.terms, line.substring("name:".length()).trim());
      } else if (line.startsWith("def:")) {
        stanza.definition = parseDefinition(line);
      } else if (line.startsWith("synonym:")) {
        addSynonym(line, stanza);
      } else if (line.startsWith("xref:")) {
        addXref(line, stanza.databaseIds);
      }
    }
    // Store a final stanza that was not followed by an empty line.
    storeConcept(ontology, cui, stanza);
    return ontology;
  }

  /** Adds a concept that has only an ID and a name to the ontology. */
  private static void addNamedConcept(OntologyStore ontology, int id, String name) {
    Concept concept = new Concept(id);
    concept.setName(name);
    ontology.setConcept(concept);
  }

  /** Adds the text as a term unless it reaches the term length limit. */
  private static void addTerm(List<TermStore> terms, String text) {
    if (text.length() < TERM_LENGTH_LIMIT) {
      terms.add(new TermStore(text));
    }
  }

  /**
   * Extracts the identifier from an {@code id:} line: the part after the first colon of the
   * value (e.g. {@code 15377} for {@code id: CHEBI:15377}), or the whole value when it has
   * no such part.
   */
  private static String parseId(String line) {
    String value = line.substring("id:".length()).trim();
    String[] parts = value.split(":");
    return parts.length > 1 ? parts[1].trim() : value;
  }

  /** Removes one leading double quote, if present, and trims the remainder. */
  private static String dropLeadingQuote(String text) {
    return text.startsWith("\"") ? text.substring(1).trim() : text;
  }

  /**
   * Extracts the quoted text of a {@code def:} line. The text ends at the last quote that is
   * followed by {@code " ["}, so brackets inside the definition itself do not cut it short.
   */
  private static String parseDefinition(String line) {
    String text = dropLeadingQuote(line.substring("def:".length()).trim());
    int closingQuote = text.lastIndexOf("\" [");
    if (closingQuote >= 0) {
      return text.substring(0, closingQuote);
    }
    // No bracketed source list: only strip a trailing quote.
    return text.endsWith("\"") ? text.substring(0, text.length() - 1) : text;
  }

  /**
   * Extracts the quoted text of a {@code synonym:} line, cutting at the last closing quote
   * that is followed by the scope {@code RELATED} or {@code EXACT}. Lines with any other
   * scope are returned with everything after the opening quote.
   */
  private static String parseSynonymText(String line) {
    String text = dropLeadingQuote(line.substring("synonym:".length()).trim());
    int closingQuote = Math.max(text.lastIndexOf("\" RELATED"), text.lastIndexOf("\" EXACT"));
    return closingQuote < 0 ? text : text.substring(0, closingQuote).trim();
  }

  /**
   * Handles a {@code synonym:} line: ordinary synonyms become terms, lines marked
   * {@code RELATED InChI} that contain {@code InChI=} become an {@code INCH} database ID,
   * and the remaining {@code RELATED InChI}/{@code RELATED SMILES} lines are ignored.
   */
  private static void addSynonym(String line, Stanza stanza) {
    boolean structureLine = line.contains("RELATED InChI") || line.contains("RELATED SMILES");
    if (!structureLine) {
      addTerm(stanza.terms, parseSynonymText(line));
    } else if (line.contains("InChI=")) {
      stanza.databaseIds.add(new DatabaseID("INCH", parseSynonymText(line)));
    }
  }

  /**
   * Handles an {@code xref:} line of the form {@code xref: <database>:<id> "<marker>"} using
   * {@link #XREF_RULES}. Lines without a colon-separated id, or that match no rule, are ignored.
   */
  private static void addXref(String line, List<DatabaseID> databaseIds) {
    String[] parts = line.substring("xref:".length()).trim().split(":");
    if (parts.length < 2) {
      return;
    }
    String dbName = parts[0].trim();
    String dbId = parts[1].trim();
    for (String[] rule : XREF_RULES) {
      if (dbName.equals(rule[0]) && dbId.contains(rule[1])) {
        int markerStart = dbId.indexOf(rule[1]);
        if (markerStart >= 1) {
          // The character just before the marker is its opening quote; drop it as well.
          databaseIds.add(new DatabaseID(rule[2], dbId.substring(0, markerStart - 1).trim()));
        }
        return;
      }
    }
  }

  /** Returns whether no concept must be created for the given ChEBI identifier. */
  private static boolean isExcluded(String chebiID) {
    for (String excluded : EXCLUDED_CHEBI_IDS) {
      if (excluded.equals(chebiID)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Makes sure the definition ends with a full stop and is at most
   * {@link #MAX_DEFINITION_LENGTH} characters long; a longer definition is cut and closed
   * with a full stop.
   */
  private static String normalizeDefinition(String definition) {
    String result = definition;
    if (!result.endsWith(".")) {
      result = result + ".";
    }
    if (result.length() > MAX_DEFINITION_LENGTH) {
      result = result.substring(0, MAX_DEFINITION_LENGTH - 1) + ".";
    }
    return result;
  }

  /**
   * Stores the stanza as a concept with the given ID, together with its database IDs and its
   * vocabulary and semantic type relations.
   *
   * @return {@code true} if a concept was stored, {@code false} if the stanza has no terms
   *     or its ChEBI identifier is excluded
   */
  private boolean storeConcept(OntologyStore ontology, int conceptId, Stanza stanza) {
    if (stanza.terms.size() == 0 || isExcluded(stanza.chebiID)) {
      return false;
    }
    Concept concept = new Concept(conceptId);
    //Replace double quotation mark with single if there is only one mark in the string
    for (TermStore term : stanza.terms) {
      int firstQuote = term.text.indexOf('"');
      if (firstQuote != -1 && firstQuote == term.text.lastIndexOf('"')) {
        term.text = term.text.replace('"', '\'');
      }
    }
    //Remove duplicates before handing the list over, so the result does not depend on
    //whether setTerms keeps a reference to the list or copies it
    OntologyUtilities.removeDuplicateTerms(stanza.terms);
    concept.setTerms(stanza.terms);
    if (stanza.definition.length() != 0) {
      concept.setDefinition(normalizeDefinition(stanza.definition));
    }
    //Set databaseIDs
    stanza.databaseIds.add(new DatabaseID("CHEB", stanza.chebiID));
    for (DatabaseID databaseId : stanza.databaseIds) {
      ontology.setDatabaseIDForConcept(concept.getID(), databaseId);
    }
    //Set concept
    ontology.setConcept(concept);
    //Set vocabularies and standard semantic type
    ontology.setRelation(new Relation(concept.getID(), DefaultTypes.fromVocabulary, generalChemicalVocID));
    ontology.setRelation(new Relation(concept.getID(), DefaultTypes.fromVocabulary, specificChemicalVocID));
    ontology.setRelation(new Relation(concept.getID(), DefaultTypes.isOfSemanticType, umlsSemID));
    return true;
  }
}