/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package JochemBuilder.chemIDplus;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Pattern;
import jregex.MatchIterator;
import jregex.MatchResult;
import jregex.Matcher;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.DefaultTypes;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.Relation;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.ontology.ontologyutilities.OntologyUtilities;
import org.erasmusmc.utilities.ReadTextFile;
import org.erasmusmc.utilities.StringUtilities;
import org.erasmusmc.utilities.WriteTextFile;
import casperSoftwareCode.Rules;
/**
 * Builds an {@link OntologyStore} of chemicals from a ChemIDplus XML export.
 *
 * <p>The file is read line by line; everything between a line containing
 * {@code <Chemical} and the next {@code </Chemical>} is glued together (without
 * line separators) into one record, which is then picked apart with plain string
 * searches rather than an XML parser. Each usable record becomes one concept with
 * terms, database IDs, an optional definition and semantic types.</p>
 *
 * <p>All parsing state lives in (public, mutable) instance fields, so an instance
 * must not be used by more than one thread at a time.</p>
 */
public class ChemicalsFromChemIDplus {

    /** Terms of this length or longer are not stored. */
    private static final int MAX_TERM_LENGTH = 255;

    /** Upper bound on the length of a stored definition, trailing full stop included. */
    private static final int MAX_DEFINITION_LENGTH = 1024;

    /** Prefix shared by {@code <SourceList>} and the empty form {@code <SourceList/>}. */
    private static final String SOURCE_LIST_PREFIX = "<SourceList";

    /** Sentinel returned by {@link #getCASnr()} when no valid CAS number is found. */
    private static final String INVALID_CAS = "Not valid";

    /** Source vocabularies accepted for {@code <ClassificationCode>} entries. */
    private static final String[] CLASSIFICATION_VOCABULARIES = {"RTECS", "IARC", "NTPA"};

    /** Source vocabularies accepted for {@code <SuperlistClassCode>} entries. */
    private static final String[] SUPERLIST_VOCABULARIES = {"IARC", "NTPA"};

    /**
     * Spelling corrections for semantic type names: {@code {as found, replacement}}.
     * The left-hand side is matched case-insensitively.
     */
    private static final String[][] SEMANTIC_TYPE_CORRECTIONS = {
        {"DNA topoisomerase II inhibitors", "DNA topoisomerase II inhibitor"},
        {"Skin / eye Irritation", "Skin / Eye Irritant"},
        {"Skin /eye Irritation", "Skin / Eye Irritant"},
        {"Reasonlly anticipated as a human carcinogen", "Reasonably anticipated to be a human carcinogen"},
        {"Overall Carcinogenic Evaluation: 2A", "Overall Carcinogenic Evaluation: Group 2A"},
        {"Overall Carcinogenic Evaluation: 2B", "Overall Carcinogenic Evaluation: Group 2B"},
        {"Overall Carcinogenic Evaluation: Group 2B (auramine, technical-grade)", "Overall Carcinogenic Evaluation: Group 2B"},
        {"Overall Carcinogenic Evaluation: Group 1 (manufacture of auramine)", "Overall Carcinogenic Evaluation: Group 1"},
    };

    /** Display names matching this pattern cause the whole record to be skipped. */
    public Pattern indexNameNotYetAssignedPattern = Pattern.compile("Index name not yet assigned", Pattern.CASE_INSENSITIVE);
    /** Matches a square-bracketed part of a term; used to spot language/country markers. */
    public jregex.Pattern bracketsPattern = new jregex.Pattern("\\[[^]]*\\]");
    /** ID and name of the concept every chemical is linked to via {@code fromVocabulary}. */
    public int generalChemicalVocID = -3000;
    public String generalChemicalVocName = "CHEMICAL";
    /** ID and name of the second vocabulary concept every chemical is linked to. */
    public int specificChemicalVocID = -3001;
    public String specificChemicalVocName = "CHEMIDPLUS";
    /** ID given to the first semantic type found; later ones get decreasing IDs. */
    public int beginSemID = -300;
    /** ID and name of the semantic type concept attached to every chemical. */
    public int umlsSemID = -103;
    public String umlsSemName = "Chemical";
    /** True while the reader is between a {@code <Chemical} line and its {@code </Chemical>}. */
    public boolean inTag = false;
    /** True only while the opening line of the current record is being handled. */
    public boolean first = false;
    /** ID of the most recently created chemical concept; incremented before each new one. */
    public int cID = 6000000;
    /** Text of the record currently being collected. */
    public StringBuffer record = new StringBuffer();
    /** {@code <NameList>} part of the current record while its names are extracted. */
    public String nameList = "";
    /** {@code <ClassificationList>} part of the current record while it is processed. */
    public String semList = "";
    /** The ontology being filled; also the return value of {@link #run(String, String)}. */
    public OntologyStore chemIDplusOntology = new OntologyStore();
    /** Country and language names loaded from the {@code countriesAndLanguages.txt} resource. */
    public List<String> countriesAndLanguages = getCountriesAndLanguages();
    /** Semantic type IDs already linked to the concept under construction. */
    public Set<Integer> foundSemTypesForConcept = null;
    /** The concept under construction for the current record. */
    public Concept concept = null;

    /** Semantic type name to the concept ID assigned to it. */
    private Map<String, Integer> semanticTypes = new HashMap<String, Integer>();

    /**
     * Reads a ChemIDplus XML file into the ontology and merges semantic types whose
     * names are equal after {@code Rules.makeLowerCaseAndRemoveEos}.
     *
     * @param filename              path of the ChemIDplus XML file
     * @param semanticTypesMergeLog path of the tab-separated log receiving one line per merge
     * @return the populated ontology (the same object as {@link #chemIDplusOntology})
     */
    public OntologyStore run(String filename, String semanticTypesMergeLog) {
        // Register the vocabulary concepts and the default semantic type concept.
        addNamedConcept(generalChemicalVocID, generalChemicalVocName);
        addNamedConcept(specificChemicalVocID, specificChemicalVocName);
        addNamedConcept(umlsSemID, umlsSemName);

        System.out.println("Processing ChemIDplus XML file " + StringUtilities.now());
        processFile(filename);

        System.out.println("Merging similar semantic types " + StringUtilities.now());
        WriteTextFile out = new WriteTextFile(semanticTypesMergeLog);
        try {
            mergeDuplicateSemanticTypes(out);
        } finally {
            // Close the log even when merging fails, so the file handle is not leaked.
            out.close();
        }
        System.out.println("Writing logfile " + StringUtilities.now());
        return chemIDplusOntology;
    }

    /** Creates a concept with the given ID and name and stores it in the ontology. */
    private void addNamedConcept(int id, String name) {
        Concept named = new Concept(id);
        named.setName(name);
        chemIDplusOntology.setConcept(named);
    }

    /**
     * Merges every semantic type concept into the first concept seen with the same
     * normalized name, writing one log line per merge.
     *
     * <p>Duplicates are collected first and merged only after the concept iterator is
     * exhausted, because merging removes concepts from the ontology being iterated.</p>
     */
    private void mergeDuplicateSemanticTypes(WriteTextFile out) {
        Map<String, Integer> nameToCui = new HashMap<String, Integer>();
        List<Concept> duplicates = new ArrayList<Concept>();
        List<Integer> targets = new ArrayList<Integer>();

        Iterator<Concept> it = chemIDplusOntology.getConceptIterator();
        while (it.hasNext()) {
            Concept candidate = it.next();
            // NOTE(review): semantic type IDs start at beginSemID and count down without
            // limit, so types beyond the 101st fall outside this fixed range - confirm intent.
            if (candidate.getID() <= -300 && candidate.getID() >= -400) {
                String modifiedName = Rules.makeLowerCaseAndRemoveEos(candidate.getName());
                Integer cui = nameToCui.get(modifiedName);
                if (cui != null) {
                    duplicates.add(candidate);
                    targets.add(cui);
                } else {
                    nameToCui.put(modifiedName, candidate.getID());
                }
            }
        }

        for (int i = 0; i < duplicates.size(); i++) {
            Concept duplicate = duplicates.get(i);
            Integer cui = targets.get(i);
            out.writeln(cui + "\t" + duplicate.getID() + "\t" + chemIDplusOntology.getConcept(cui).getName() + "\t" + duplicate.getName());
            mergeSemanticTypes(chemIDplusOntology, duplicate.getID(), cui, true);
        }
    }

    /**
     * Feeds the non-empty lines of the file to {@link #processTag(String, int)},
     * starting a new record at each line that contains {@code <chemical} (any case)
     * while no record is open. Prints the record count every 10000 records.
     */
    private void processFile(String chemIDplusFile) {
        ReadTextFile textFile = new ReadTextFile(chemIDplusFile);
        Iterator<String> lineIterator = textFile.getIterator();
        int tagCount = 0;
        while (lineIterator.hasNext()) {
            String line = lineIterator.next();
            if (line.length() == 0) {
                continue;
            }
            int beginTagIndex = line.toLowerCase().indexOf("<chemical");
            if (!inTag && beginTagIndex != -1) {
                inTag = true;
                first = true;
                tagCount++;
                if (tagCount % 10000 == 0) {
                    System.out.println(tagCount);
                }
            }
            if (inTag) {
                processTag(line, beginTagIndex);
            }
        }
    }

    /**
     * Appends one line to the current record and, when the line closes the record,
     * stores it and resets the parser state.
     *
     * <p>On the opening line only the text from the opening tag onwards is kept, and
     * the closing tag is searched for after the opening tag, so a record that starts
     * and ends on one line is handled too. Text following {@code </chemical>} on the
     * same line is discarded.</p>
     *
     * @param line          the current, non-empty input line
     * @param beginTagIndex index of {@code <chemical} in the line, or -1
     */
    private void processTag(String line, int beginTagIndex) {
        int start = (first && beginTagIndex > 0) ? beginTagIndex : 0;
        first = false;
        // Indices come from the lower-cased copy; this assumes lower-casing keeps the length.
        int endTagIndex = line.toLowerCase().indexOf("</chemical>", start);
        if (endTagIndex == -1) {
            record.append(line.substring(start));
            return;
        }
        record.append(line.substring(start, endTagIndex));
        storeRecord();
        record = new StringBuffer();
        inTag = false;
    }

    /**
     * Turns the collected record into a concept. Records without an ID or display
     * name, or whose display name says the index name is not yet assigned, are skipped.
     */
    private void storeRecord() {
        String chemID = getChemID();
        String displayName = getChemName();
        if (chemID.length() == 0 || displayName.length() == 0
                || indexNameNotYetAssignedPattern.matcher(displayName).find()) {
            return;
        }

        cID++;
        concept = new Concept(cID);
        foundSemTypesForConcept = new TreeSet<Integer>();
        List<TermStore> terms = concept.getTerms();

        // The display name becomes a term (it is not set as the concept name).
        if (displayName.length() < MAX_TERM_LENGTH && !termNotEnglish(displayName)) {
            terms.add(new TermStore(displayName));
        }

        // The display formula becomes a term unless it is flagged as unspecified.
        String displayFormula = getFormula();
        if (displayFormula.length() != 0 && !displayFormula.contains("Unspecified")
                && !displayFormula.contains("unspecified") && !displayFormula.contains("UNSPECIFIED")) {
            terms.add(new TermStore(displayFormula));
        }

        // NOTE(review): database IDs and relations are registered even when the concept
        // ends up without terms and is therefore never stored - confirm this is intended.
        chemIDplusOntology.setDatabaseIDForConcept(concept.getID(), new DatabaseID("CHID", chemID));
        if (record.indexOf("<CASRegistryNumber>") != -1) {
            String casnr = getCASnr();
            if (!casnr.equals(INVALID_CAS)) {
                chemIDplusOntology.setDatabaseIDForConcept(concept.getID(), new DatabaseID("CAS", casnr));
            }
        }

        // Names: substance names first, then systematic names, then synonyms.
        int beginNameIndex = record.indexOf("<NameList>");
        int endNameIndex = beginNameIndex == -1 ? -1 : record.indexOf("</NameList>", beginNameIndex);
        if (beginNameIndex != -1 && endNameIndex != -1) {
            nameList = record.substring(beginNameIndex, endNameIndex);
            addTaggedNames(terms, "NameOfSubstance");
            addTaggedNames(terms, "SystematicName");
            addTaggedNames(terms, "Synonyms");
            nameList = "";
        }

        if (!terms.isEmpty()) {
            // A lone double quote is replaced by a single quote; paired quotes are kept.
            for (TermStore term : terms) {
                int firstQuote = term.text.indexOf('"');
                if (firstQuote != -1 && firstQuote == term.text.lastIndexOf('"')) {
                    term.text = term.text.replace('"', '\'');
                }
            }
            concept.setTerms(terms);
            OntologyUtilities.removeDuplicateTerms(concept.getTerms());
        }

        if (record.indexOf("<Note>") != -1) {
            concept.setDefinition(findDefinition());
        }

        // Semantic types from the classification list.
        if (record.indexOf("<ClassificationCode>") != -1) {
            int beginSemIndex = record.indexOf("<ClassificationList>");
            int endSemIndex = beginSemIndex == -1 ? -1 : record.indexOf("</ClassificationList>", beginSemIndex);
            if (beginSemIndex != -1 && endSemIndex != -1) {
                semList = record.substring(beginSemIndex, endSemIndex);
                List<String> semStrings = collectSemanticTypes("ClassificationCode", CLASSIFICATION_VOCABULARIES);
                semStrings.addAll(collectSemanticTypes("SuperlistClassCode", SUPERLIST_VOCABULARIES));
                for (String semString : semStrings) {
                    addSemanticType(semString);
                }
                semList = "";
            }
        }

        // Only concepts that ended up with at least one term are stored.
        if (concept.getTerms().size() != 0) {
            chemIDplusOntology.setConcept(concept);
        }

        // Link to both vocabularies and to the default semantic type.
        chemIDplusOntology.setRelation(new Relation(concept.getID(), DefaultTypes.fromVocabulary, generalChemicalVocID));
        chemIDplusOntology.setRelation(new Relation(concept.getID(), DefaultTypes.fromVocabulary, specificChemicalVocID));
        chemIDplusOntology.setRelation(new Relation(concept.getID(), DefaultTypes.isOfSemanticType, umlsSemID));
    }

    /** Returns the trimmed {@code displayName} attribute of the record, or "" if absent. */
    private String getChemName() {
        return attributeValue("displayName=\"").trim();
    }

    /** Returns the trimmed {@code displayFormula} attribute of the record, or "" if absent. */
    private String getFormula() {
        return attributeValue("displayFormula=\"").trim();
    }

    /** Returns the {@code id} attribute of the {@code Chemical} element, or "" if absent. */
    private String getChemID() {
        return attributeValue("Chemical id=\"");
    }

    /**
     * Returns the text between the first occurrence of {@code marker} in the record
     * and the next double quote, or "" when either is missing.
     *
     * @param marker the attribute name including {@code ="}
     */
    private String attributeValue(String marker) {
        int markerIndex = record.indexOf(marker);
        if (markerIndex == -1) {
            return "";
        }
        int valueStart = markerIndex + marker.length();
        int valueEnd = record.indexOf("\"", valueStart);
        if (valueEnd == -1) {
            return "";
        }
        return record.substring(valueStart, valueEnd);
    }

    /**
     * Returns the text of a tagged entry: everything in {@code fragment} before the
     * entry's source list or, if that comes first or there is no source list, before
     * {@code closeTag}. Returns "" when neither marker is present.
     *
     * @param fragment text starting right after the entry's opening tag
     * @param closeTag the entry's closing tag
     */
    private static String entryText(String fragment, String closeTag) {
        int closeIndex = fragment.indexOf(closeTag);
        int sourceIndex = fragment.indexOf(SOURCE_LIST_PREFIX);
        int end = closeIndex;
        if (sourceIndex != -1 && (closeIndex == -1 || sourceIndex < closeIndex)) {
            end = sourceIndex;
        }
        return end == -1 ? "" : fragment.substring(0, end).trim();
    }

    /**
     * Returns the first CAS registry number of the record if it passes
     * {@link #validCASnr(String)}, otherwise the sentinel {@code "Not valid"}.
     */
    private String getCASnr() {
        String openTag = "<CASRegistryNumber>";
        int tagIndex = record.indexOf(openTag);
        if (tagIndex == -1) {
            return INVALID_CAS;
        }
        String casnr = entryText(record.substring(tagIndex + openTag.length()), "</CASRegistryNumber>");
        return validCASnr(casnr) ? casnr : INVALID_CAS;
    }

    /**
     * Checks the shape and check digit of a CAS registry number.
     *
     * <p>Accepted shape: 1-7 digits, hyphen, 2 digits, hyphen, 1 check digit. The
     * check digit must equal the sum of the other digits, each multiplied by its
     * 1-based position counted from the right, modulo 10.</p>
     *
     * @param cas the candidate number
     * @return true if the number is well formed and its check digit matches
     */
    private boolean validCASnr(String cas) {
        // Limit -1 keeps trailing empty parts, so "64-77-7-" is rejected.
        String[] parts = cas.split("-", -1);
        if (parts.length != 3) {
            return false;
        }
        if (parts[0].length() == 0 || parts[0].length() >= 8
                || parts[1].length() != 2 || parts[2].length() != 1) {
            return false;
        }
        int checkDigit = Character.digit(parts[2].charAt(0), 10);
        if (checkDigit == -1) {
            return false;
        }
        String digits = parts[0] + parts[1];
        int sum = 0;
        for (int i = 0; i < digits.length(); i++) {
            int digit = Character.digit(digits.charAt(i), 10);
            if (digit == -1) {
                return false;
            }
            sum += digit * (digits.length() - i);
        }
        return sum % 10 == checkDigit;
    }

    /**
     * Adds every usable {@code <tagName>} entry of {@link #nameList} to {@code terms}.
     * Empty names, names of 255 characters or more, names flagged by
     * {@link #termNotEnglish(String)} and Beilstein references are left out.
     *
     * @param terms   the list to add to
     * @param tagName element name without angle brackets, e.g. {@code Synonyms}
     * @return {@code terms}, for convenience
     */
    private List<TermStore> addTaggedNames(List<TermStore> terms, String tagName) {
        String openTag = "<" + tagName + ">";
        String closeTag = "</" + tagName + ">";
        if (nameList.indexOf(openTag) == -1) {
            return terms;
        }
        for (String column : nameList.split(openTag)) {
            if (!column.contains(closeTag)) {
                continue;
            }
            String name = entryText(column, closeTag);
            if (name.length() != 0 && name.length() < MAX_TERM_LENGTH && !termNotEnglish(name)
                    && !name.contains("Beilstein Handbook Reference")) {
                terms.add(new TermStore(name));
            }
        }
        return terms;
    }

    /**
     * Collects the semantic type names of all {@code <tagName>} entries of
     * {@link #semList} whose source is one of the accepted vocabularies.
     *
     * @param tagName              element name without angle brackets
     * @param acceptedVocabularies source names to accept, compared case-insensitively
     * @return the (spelling-corrected) names, in document order; never null
     */
    private List<String> collectSemanticTypes(String tagName, String[] acceptedVocabularies) {
        List<String> semTypes = new ArrayList<String>();
        String openTag = "<" + tagName + ">";
        String closeTag = "</" + tagName + ">";
        for (String column : semList.split(openTag)) {
            int closeIndex = column.indexOf(closeTag);
            if (closeIndex == -1) {
                continue;
            }
            // Look for the source inside this entry only, not in whatever follows it.
            String entry = column.substring(0, closeIndex);
            if (!entry.contains("<Source>") || !entry.contains("</Source>")) {
                continue;
            }
            String voc = StringUtilities.findBetween(entry, "<Source>", "</Source>").trim();
            if (isAcceptedVocabulary(voc, acceptedVocabularies)) {
                semTypes.add(normalizeSemanticType(entryText(column, closeTag)));
            }
        }
        return semTypes;
    }

    /** Returns true if {@code voc} equals one of {@code accepted}, ignoring case. */
    private static boolean isAcceptedVocabulary(String voc, String[] accepted) {
        for (String candidate : accepted) {
            if (voc.equalsIgnoreCase(candidate)) {
                return true;
            }
        }
        return false;
    }

    /** Applies {@link #SEMANTIC_TYPE_CORRECTIONS}; unknown names are returned unchanged. */
    private static String normalizeSemanticType(String semType) {
        for (String[] correction : SEMANTIC_TYPE_CORRECTIONS) {
            if (semType.equalsIgnoreCase(correction[0])) {
                return correction[1];
            }
        }
        return semType;
    }

    /**
     * Returns true if any square-bracketed part of {@code name} contains, ignoring
     * case, one of the entries of {@link #countriesAndLanguages}.
     */
    private boolean termNotEnglish(String name) {
        Matcher m = bracketsPattern.matcher(name);
        MatchIterator mi = m.findAll();
        while (mi.hasMore()) {
            MatchResult mr = mi.nextMatch();
            String match = mr.toString().toLowerCase();
            for (String term : countriesAndLanguages) {
                if (match.contains(term.toLowerCase())) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Loads the {@code countriesAndLanguages.txt} resource, one entry per line.
     * Lines are trimmed and blank lines are skipped, because an empty entry would
     * match every bracketed term in {@link #termNotEnglish(String)}.
     *
     * @return the entries read; possibly incomplete if reading failed part-way
     * @throws NullPointerException if the resource cannot be found
     */
    private List<String> getCountriesAndLanguages() {
        List<String> countries = new ArrayList<String>();
        java.io.InputStream stream = ChemicalsFromChemIDplus.class.getResourceAsStream("countriesAndLanguages.txt");
        if (stream == null) {
            throw new NullPointerException("Resource countriesAndLanguages.txt not found relative to "
                    + ChemicalsFromChemIDplus.class.getName());
        }
        // NOTE(review): the platform default charset is used, as before - confirm the file's encoding.
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream));
        try {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                String entry = line.trim();
                if (entry.length() != 0) {
                    countries.add(entry);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                bufferedReader.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing a read-only resource fails.
            }
        }
        return countries;
    }

    /**
     * Links the current concept to the semantic type with the given name, creating
     * the semantic type concept on first use. Each type is linked at most once per concept.
     */
    private void addSemanticType(String semString) {
        Integer semID = semanticTypes.get(semString);
        if (semID == null) {
            semID = beginSemID - semanticTypes.size();
            semanticTypes.put(semString, semID);
            Concept semanticType = new Concept(semID);
            semanticType.setName(semString);
            chemIDplusOntology.setConcept(semanticType);
        }
        if (!foundSemTypesForConcept.contains(semID)) {
            chemIDplusOntology.setRelation(new Relation(concept.getID(), DefaultTypes.isOfSemanticType, semID));
            foundSemTypesForConcept.add(semID);
        }
    }

    /**
     * Returns the text of the record's first {@code <Note>} as a definition that
     * ends in a full stop and is at most 1024 characters long.
     */
    private String findDefinition() {
        String openTag = "<Note>";
        int noteIndex = record.indexOf(openTag);
        String definition = noteIndex == -1
                ? ""
                : entryText(record.substring(noteIndex + openTag.length()), "</Note>");

        if (definition.length() > MAX_DEFINITION_LENGTH) {
            // Too long: cut and close with a full stop, whatever the text ended with.
            return definition.substring(0, MAX_DEFINITION_LENGTH - 1) + ".";
        }
        if (definition.endsWith(".")) {
            return definition;
        }
        if (definition.length() == MAX_DEFINITION_LENGTH) {
            // Make room for the full stop so the result stays within the limit.
            definition = definition.substring(0, MAX_DEFINITION_LENGTH - 1);
        }
        return definition + ".";
    }

    /**
     * Merges concept {@code fromCUI} into concept {@code toCUI}: the target takes over
     * the source's name, its definition (appended after ";" if the target has one),
     * its relations and its database IDs.
     *
     * @param ontology          the ontology holding both concepts
     * @param fromCUI           ID of the concept to merge away
     * @param toCUI             ID of the concept to merge into
     * @param removeFromConcept whether to remove the source concept afterwards
     */
    private static void mergeSemanticTypes(Ontology ontology, int fromCUI, int toCUI, boolean removeFromConcept) {
        if (fromCUI == toCUI) {
            System.out.println("ERROR: attempted to merge " + fromCUI + " to itself!");
            return;
        }
        Concept fromConcept = ontology.getConcept(fromCUI);
        Concept toConcept = ontology.getConcept(toCUI);
        if (fromConcept == null || toConcept == null) {
            System.out.println("Attempted merge with a non existing Concept: either " + toCUI + " and/or " + fromCUI);
            return;
        }

        toConcept.setName(fromConcept.getName());

        // Fetch everything attached to the source before anything is re-pointed.
        List<Relation> fromRelationsSub = ontology.getRelationsForConceptAsSubject(fromCUI);
        List<Relation> fromRelationsObj = ontology.getRelationsForConceptAsObject(fromCUI);
        List<DatabaseID> fromDbIDs = ontology.getDatabaseIDsForConcept(fromCUI);

        if (!fromConcept.getDefinition().equals("")) {
            String def = fromConcept.getDefinition();
            if (!toConcept.getDefinition().equals("")) {
                def = toConcept.getDefinition() + ";" + def;
            }
            toConcept.setDefinition(def);
        }
        for (Relation relation : fromRelationsObj) {
            relation.object = toCUI;
            ontology.setRelation(relation);
        }
        for (Relation relation : fromRelationsSub) {
            relation.subject = toCUI;
            ontology.setRelation(relation);
        }
        for (DatabaseID databaseID : fromDbIDs) {
            ontology.setDatabaseIDForConcept(toCUI, databaseID);
        }
        if (removeFromConcept) {
            ontology.removeConcept(fromCUI);
        }
    }
}