/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package JochemBuilder.DrugBank;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import jregex.MatchIterator;
import jregex.MatchResult;
import jregex.Matcher;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.DefaultTypes;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.Relation;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.ontology.ontologyutilities.OntologyUtilities;
import org.erasmusmc.utilities.ReadTextFile;
import org.erasmusmc.utilities.StringUtilities;
import JochemBuilder.chemIDplus.ChemicalsFromChemIDplus;
public class ChemicalsFromDrugBank {
public int generalChemicalVocID = -3000;
public String generalChemicalVocName = "CHEMICAL";
public int specificChemicalVocID = -3003;
public String specificChemicalVocName = "DRUGBANK";
public int umlsSemID = -103;
public String umlsSemName = "Chemical";
public jregex.Pattern bracketsPattern = new jregex.Pattern("\\[[^]]*\\]");
public boolean inTag = false;
public boolean first = false;
public int cID = 3000000;
public StringBuffer record = new StringBuffer();
public OntologyStore drugbankOntology = new OntologyStore();
public List<String> countriesAndLanguages = getCountriesAndLanguages();
public Set<Integer> foundSemTypesForConcept = null;
public Concept concept = null;
public Map<String, Integer> semanticTypes = new HashMap<String, Integer>();
public OntologyStore run(String filename){
//Set the standard vocabulary and umls semantic type concepts
Concept generalVocabulary = new Concept(generalChemicalVocID);
generalVocabulary.setName(generalChemicalVocName);
drugbankOntology.setConcept(generalVocabulary);
Concept specificVocabulary = new Concept(specificChemicalVocID);
specificVocabulary.setName(specificChemicalVocName);
drugbankOntology.setConcept(specificVocabulary);
Concept semantictype = new Concept(umlsSemID);
semantictype.setName(umlsSemName);
drugbankOntology.setConcept(semantictype);
System.out.println("Processing drug cards file " + StringUtilities.now());
processFile(filename);
System.out.println("Writing DrugBank ontology " + StringUtilities.now());
return drugbankOntology;
}
private void processFile(String drugBankFile){
ReadTextFile textFile = new ReadTextFile(drugBankFile);
Iterator<String> lineIterator = textFile.getIterator();
int tagCount = 0;
while (lineIterator.hasNext()) {
String line = lineIterator.next();
if (line.length() != 0) {
Integer beginTagIndex = line.toLowerCase().indexOf("#begin_drugcard");
if (!inTag && beginTagIndex != -1){
inTag = true;
first = true;
tagCount++;
if (tagCount % 10000 == 0)
System.out.println(tagCount);
}
if (inTag){
processTag(line, beginTagIndex);
}
}
}
}
private void processTag(String line, Integer beginTagIndex){
Integer endTagIndex = line.toLowerCase().indexOf("#end_drugcard");
if (endTagIndex == -1){
if (first){
String firstline = line.substring(beginTagIndex, line.length());
record.append(firstline+"\t");
first = false;
}
else record.append(line+"\t");
}
if (endTagIndex != -1){
String substring = line.substring(0, endTagIndex);
record.append(substring);
storeRecord();
record = new StringBuffer();
inTag = false;
}
}
private void storeRecord(){
cID++;
concept = new Concept(cID);
foundSemTypesForConcept = new TreeSet<Integer>();
String name = "";
String term = "";
List<TermStore> terms = new ArrayList<TermStore>();
List<DatabaseID> databaseIds = new ArrayList<DatabaseID>();
String definition = "";
// String sem = "";
// List<String> sems = new ArrayList<String>();
String recordString = record.toString();
String[] columns = recordString.split("#");
for (String column: columns ){
column = column.trim();
if (column.startsWith("BEGIN_DRUGCARD")){
String[] parts = column.split(" ");
String dbNr = parts[1].trim();
DatabaseID dbID = new DatabaseID("DRUG", dbNr);
databaseIds.add(dbID);
}else if (column.startsWith("Brand_Names")){
String[] parts = column.split("\t");
for (String part: parts){
if (!part.contains("Brand_Names") && part.length()<=255 && !part.contains("Not Available") && !termNotEnglish(part)){
term = part.trim();
terms.add(new TermStore(term));
}
}
}else if (column.startsWith("CAS_Registry_Number")){
String[] parts = column.split("\t");
String dbNr = parts[1].trim();
if (!dbNr.contains("Not Available")){
DatabaseID dbID = new DatabaseID("CAS", dbNr);
databaseIds.add(dbID);
}
}else if (column.startsWith("InChI_Identifier")){
String[] parts = column.split("\t");
String dbNr = parts[1].trim();
if (!dbNr.contains("Not Available")){
DatabaseID dbID = new DatabaseID("INCH", dbNr);
databaseIds.add(dbID);
}
}else if (column.startsWith("ChEBI_ID")){
String[] parts = column.split("\t");
String dbNr = parts[1].trim();
if (!dbNr.contains("Not Available")){
DatabaseID dbID = new DatabaseID("CHEB", dbNr);
databaseIds.add(dbID);
}
}
else if (column.startsWith("Chemical_Formula")){
String[] parts = column.split("\t");
for (String part: parts){
if (!part.contains("Chemical_Formula") && part.length()<=255 && !part.contains("Not Available") && !termNotEnglish(part)){
term = part.trim();
terms.add(new TermStore(term));
}
}
}else if (column.startsWith("Chemical_IUPAC_Name")){
String[] parts = column.split("\t");
for (String part: parts){
if (!part.contains("Chemical_IUPAC_Name") && part.length()<=255 && !part.contains("Not Available") && !termNotEnglish(part)){
term = part.trim();
terms.add(new TermStore(term));
}
}
}else if (column.startsWith("Description")){
String[] parts = column.split("\t");
if (!parts[1].contains("Not Available"))
definition = definition + parts[1].trim();
}/**else if (column.startsWith("Drug_Category")){
String[] parts = column.split("\t");
for (String part: parts){
if (!part.contains("Drug_Category") && part.length()<=255 && !part.contains("Not Available")){
sem = part.trim();
sems.add(sem);
}
}
} */ else if (column.startsWith("Generic_Name")){
String[] parts = column.split("\t");
for (String part: parts){
if (!part.contains("Generic_Name") && part.length()<=255 && !part.contains("Not Available") && !termNotEnglish(part)){
term = part.trim();
name = term;
terms.add(new TermStore(term));
}
}
}else if (column.startsWith("KEGG_Compound_ID")){
String[] parts = column.split("\t");
String dbNr = parts[1].trim();
if (!dbNr.contains("Not Available")){
DatabaseID dbID = new DatabaseID("KEGG", dbNr);
databaseIds.add(dbID);
}
}else if (column.startsWith("KEGG_Drug_ID")){
String[] parts = column.split("\t");
String dbNr = parts[1].trim();
if (!dbNr.contains("Not Available")){
DatabaseID dbID = new DatabaseID("KEGD", dbNr);
databaseIds.add(dbID);
}
}else if (column.startsWith("Mechanism_Of_Action")){
String[] parts = column.split("\t");
if (!parts[1].contains("Not Available"))
definition = definition + parts[1].trim();
}else if (column.startsWith("PubChem_Compound_ID")){
String[] parts = column.split("\t");
String dbNr = parts[1].trim();
if (!dbNr.contains("Not Available")){
DatabaseID dbID = new DatabaseID("PUBC", dbNr);
databaseIds.add(dbID);
}
}else if (column.startsWith("PubChem_Substance_ID")){
String[] parts = column.split("\t");
String dbNr = parts[1].trim();
if (!dbNr.contains("Not Available")){
DatabaseID dbID = new DatabaseID("PUBS", dbNr);
databaseIds.add(dbID);
}
}else if (column.startsWith("Synonyms")){
String[] parts = column.split("\t");
for (String part: parts){
if (!part.contains("Synonyms") && part.length()<=255 && !part.contains("Not Available") && !termNotEnglish(part)){
term = part.trim();
terms.add(new TermStore(term));
}
}
}else if (column.startsWith("Toxicity")){
String[] parts = column.split("\t");
if (!parts[1].contains("Not Available"))
definition = definition + parts[1].trim();
}
}
//Set terms
if (terms.size() != 0){
//Replace double quotation mark with single if there is only one mark in the string
for (TermStore termToCheck: terms){
int i = 0;
char currentchar;
int numberOfQuotationMarks = 0;
while (i < termToCheck.text.length()){
currentchar = termToCheck.text.charAt(i);
if (currentchar =='"'){
numberOfQuotationMarks++;
}
i++;
}
if (numberOfQuotationMarks==1){
termToCheck.text = termToCheck.text.replace('"', '\'');
// System.out.println(termToCheck.text);
}
}
// Set terms and remove duplicates
terms.add(0, (new TermStore(name)));
concept.setTerms(terms);
OntologyUtilities.removeDuplicateTerms(terms);
}
//Set definition. If longer than 1024 characters, then substring and add a full stop.
if (definition.length()!=0){
if (!definition.endsWith(".") && definition.length()<=1024){
definition = definition+".";
} else if (!definition.endsWith(".") && definition.length()>1024){
definition = definition.substring(0, 1023)+".";
}
concept.setDefinition(definition);
}
/** //Set semantic types
if (!sems.isEmpty()){
for (String semString: sems){
Integer semID = semanticTypes.get(semString);
if (semID == null) {
semID = -300 - semanticTypes.size();
semanticTypes.put(semString, semID);
Concept semanticType = new Concept(semID);
semanticType.setName(semString);
drugbankOntology.setConcept(semanticType);
}
if (!foundSemTypesForConcept.contains(semID)) {
Relation relation = new Relation(concept.getID(), DefaultTypes.isOfSemanticType, semID);
drugbankOntology.setRelation(relation);
foundSemTypesForConcept.add(semID);
}
}
}*/
//Set database IDs
if (!databaseIds.isEmpty()){
for (DatabaseID databaseId: databaseIds){
drugbankOntology.setDatabaseIDForConcept(concept.getID(), databaseId);
}
}
// Set concept
if (concept != null) {
if (concept.getTerms().size() != 0)
drugbankOntology.setConcept(concept);
}
//Set vocabularies and standard semantic type
Relation generalVocRelation = new Relation(concept.getID(), DefaultTypes.fromVocabulary, generalChemicalVocID);
drugbankOntology.setRelation(generalVocRelation);
Relation specificVocRelation = new Relation(concept.getID(), DefaultTypes.fromVocabulary, specificChemicalVocID);
drugbankOntology.setRelation(specificVocRelation);
Relation semRelation = new Relation(concept.getID(), DefaultTypes.isOfSemanticType, umlsSemID);
drugbankOntology.setRelation(semRelation);
}
private boolean termNotEnglish(String name){
boolean found = false;
Matcher m = bracketsPattern.matcher(name);
MatchIterator mi = m.findAll();
while(mi.hasMore()){
MatchResult mr=mi.nextMatch();
String match = mr.toString().toLowerCase();
Iterator listiterator = countriesAndLanguages.iterator();
while (listiterator.hasNext()){
String term = listiterator.next().toString().toLowerCase();
if (match.contains(term)){
found = true;
}
}
}
return found;
}
private ArrayList<String> getCountriesAndLanguages() {
ArrayList<String> countries = new ArrayList<String>();
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(ChemicalsFromChemIDplus.class.getResourceAsStream("countriesAndLanguages.txt")));
try {
while (bufferedReader.ready()) {
countries.add(bufferedReader.readLine().trim());
}
} catch (IOException e) {
e.printStackTrace();
}
return countries;
}
}