/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 * Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */
package JochemBuilder.EvaluationScripts;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.OntologyFileLoader;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.utilities.StringUtilities;
import org.erasmusmc.utilities.WriteTextFile;

/**
 * Command-line script that loads an ontology file and writes two text files:
 * <ul>
 * <li>{@link #statistics}: overall counts (concepts, terms, database references
 * per source, unique CAS/InChI identifiers) followed by a histogram of how many
 * concepts carry 2, 3, 4, 5, 6-9, 10-19 or &gt;=20 CAS numbers / InChI codes;</li>
 * <li>{@link #casAndInchi}: one line {@code <conceptID>\t<bucket>} per concept
 * that has at least one CAS number, and likewise one line per concept that has
 * at least one InChI code.</li>
 * </ul>
 * Only concepts whose ID is greater than zero are taken into account.
 */
public class GetChemicalThesaurusStatistics {

    /** Base directory of the input and output files. */
    public static String home = "/home/khettne/Projects/Jochem/";

    // NOTE: the three paths below are computed once, when the class is initialised;
    // assigning a new value to 'home' afterwards does not change them.

    /** Ontology file that is loaded and analysed. */
    public static String ontologyFile = home+"Jochem_V1_2.ontology";

    /** Output file receiving the overall statistics report. */
    public static String statistics = home+"Jochem_V1_2.statistics";

    /** Output file receiving the per-concept CAS / InChI bucket lines. */
    public static String casAndInchi = home+"Jochem_V1_2_casAndInchi.txt";

    /** Database code used for CAS registry numbers. */
    private static final String CAS = "CAS";

    /** Database code used for InChI codes. */
    private static final String INCHI = "INCH";

    /**
     * Histogram bucket labels, indexed by the value returned from {@link #bucketIndex(int)}.
     * A count of exactly five has its own bucket, so the following bucket starts at six.
     */
    private static final String[] BUCKET_LABELS = {"1", "2", "3", "4", "5", "6-9", "10-19", ">=20"};

    /** Database codes of the cross-references listed in the report, in report order. */
    private static final String[] REFERENCE_CODES =
            {"PUBC", "PUBS", "KEGD", "KEGG", "CHEB", "DRUG", "HMDB", "CHID"};

    /** Report line prefixes, parallel to {@link #REFERENCE_CODES}. */
    private static final String[] REFERENCE_LABELS = {
        "PubChem Compound ref: ", "PubChem Substance ref: ", "KEGG drug ref: ", "KEGG compound ref: ",
        "ChEBI ref: ", "DrugBank ref: ", "HMDB ref: ", "ChemIDplus ref: "};

    /**
     * Accumulated results of one pass over the ontology. Kept per run (instead of in
     * static fields) so that calling {@link #main(String[])} twice does not add the
     * second run on top of the first.
     */
    private static final class Statistics {
        int concepts = 0;
        int terms = 0;
        /** Number of database IDs seen per database code. */
        final Map<String, Integer> referencesPerDatabase = new HashMap<String, Integer>();
        final Set<String> uniqueCas = new HashSet<String>();
        final Set<String> uniqueInChI = new HashSet<String>();
        /** Number of concepts per CAS-count bucket, indexed like {@link #BUCKET_LABELS}. */
        final int[] casBuckets = new int[BUCKET_LABELS.length];
        /** Number of concepts per InChI-count bucket, indexed like {@link #BUCKET_LABELS}. */
        final int[] inChIBuckets = new int[BUCKET_LABELS.length];

        /** Registers one more database ID for the given database code. */
        void countReference(String database) {
            referencesPerDatabase.put(database, Integer.valueOf(references(database) + 1));
        }

        /** Returns how many database IDs were seen for the given code (0 if none). */
        int references(String database) {
            Integer seen = referencesPerDatabase.get(database);
            return seen == null ? 0 : seen.intValue();
        }
    }

    /**
     * Runs the script: opens both output files, loads {@link #ontologyFile}, gathers
     * the statistics and writes the report. Both writers are closed even when loading
     * or processing fails.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("Starting script "+StringUtilities.now());
        WriteTextFile out = new WriteTextFile(statistics);
        try {
            WriteTextFile out2 = new WriteTextFile(casAndInchi);
            try {
                System.out.println("Loading ontology "+StringUtilities.now());
                OntologyFileLoader ontologyLoader = new OntologyFileLoader();
                OntologyStore ontology = ontologyLoader.load(ontologyFile);
                Statistics stats = collectStatistics(ontology, out2);
                writeReport(out, stats);
            } finally {
                out2.close();
            }
        } finally {
            out.close();
        }
        System.out.println("Done! "+StringUtilities.now());
    }

    /**
     * Iterates over all concepts of the ontology, tallies the statistics and writes
     * the per-concept bucket lines.
     *
     * @param ontology the loaded ontology
     * @param perConceptOut writer receiving the {@code <conceptID>\t<bucket>} lines
     * @return the accumulated statistics
     */
    private static Statistics collectStatistics(OntologyStore ontology, WriteTextFile perConceptOut) {
        Statistics stats = new Statistics();
        Iterator<Concept> conceptIterator = ontology.getConceptIterator();
        int lineCount = 0;
        while (conceptIterator.hasNext()) {
            lineCount++;
            if (lineCount % 10000 == 0) {
                System.out.println(lineCount); // progress indicator
            }
            Concept concept = conceptIterator.next();
            Integer conceptId = concept.getID();
            if (conceptId <= 0) {
                continue;
            }
            stats.concepts++;
            stats.terms += concept.getTerms().size();

            int casOfConcept = 0;
            int inChIOfConcept = 0;
            List<DatabaseID> dbIds = ontology.getDatabaseIDsForConcept(conceptId);
            if (dbIds != null) { // defensive: treat "no IDs" reported as null like an empty list
                for (DatabaseID id : dbIds) {
                    stats.countReference(id.database);
                    // Constant-first comparison so a null database code cannot throw.
                    if (CAS.equals(id.database)) {
                        casOfConcept++;
                        stats.uniqueCas.add(id.ID);
                    } else if (INCHI.equals(id.database)) {
                        inChIOfConcept++;
                        stats.uniqueInChI.add(id.ID);
                    }
                }
            }

            // NOTE: CAS and InChI lines share the same layout and the same file, so a
            // concept with both kinds of identifier produces two lines (CAS first).
            recordBucket(conceptId, casOfConcept, stats.casBuckets, perConceptOut);
            recordBucket(conceptId, inChIOfConcept, stats.inChIBuckets, perConceptOut);
        }
        return stats;
    }

    /**
     * Adds a concept to the histogram bucket matching its identifier count and writes
     * the corresponding per-concept line. Does nothing when the count is zero.
     */
    private static void recordBucket(Integer conceptId, int count, int[] buckets, WriteTextFile perConceptOut) {
        int bucket = bucketIndex(count);
        if (bucket < 0) {
            return;
        }
        buckets[bucket]++;
        perConceptOut.writeln(conceptId+"\t"+BUCKET_LABELS[bucket]);
    }

    /**
     * Maps an identifier count to its index in {@link #BUCKET_LABELS}.
     *
     * @param count number of identifiers of one kind attached to a concept
     * @return 0..4 for counts 1..5, 5 for 6-9, 6 for 10-19, 7 for 20 or more,
     *         and -1 when {@code count} is zero or negative
     */
    static int bucketIndex(int count) {
        if (count <= 0) {
            return -1;
        }
        if (count <= 5) {
            return count - 1;
        }
        if (count <= 9) {
            return 5;
        }
        if (count <= 19) {
            return 6;
        }
        return 7;
    }

    /** Writes the overall statistics report. */
    private static void writeReport(WriteTextFile out, Statistics stats) {
        out.writeln("Concepts: "+stats.concepts);
        out.writeln("Terms: "+stats.terms);
        out.writeln("CAS numbers: "+stats.references(CAS));
        out.writeln("Unique CAS numbers: "+stats.uniqueCas.size());
        out.writeln("InChI codes: "+stats.references(INCHI));
        out.writeln("Unique InChI codes: "+stats.uniqueInChI.size());
        for (int i = 0; i < REFERENCE_CODES.length; i++) {
            out.writeln(REFERENCE_LABELS[i]+stats.references(REFERENCE_CODES[i]));
        }
        out.writeln("\n");
        out.writeln("Concepts with more than one CAS number or InChI:");
        writeBuckets(out, "CAS", stats.casBuckets);
        writeBuckets(out, "InChI", stats.inChIBuckets);
    }

    /**
     * Writes one histogram section. Bucket 0 (exactly one identifier) is skipped
     * because the section only lists concepts with more than one identifier.
     */
    private static void writeBuckets(WriteTextFile out, String kind, int[] buckets) {
        for (int i = 1; i < buckets.length; i++) {
            out.writeln(BUCKET_LABELS[i]+" "+kind+": "+buckets[i]);
        }
    }
}