/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package act.installer.pubchem; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.DefaultParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.http.client.utils.URIBuilder; import org.apache.jena.arq.querybuilder.SelectBuilder; import org.apache.jena.graph.NodeFactory; import org.apache.jena.query.Query; import org.apache.jena.query.QueryExecution; import org.apache.jena.query.QueryExecutionFactory; import org.apache.jena.query.QuerySolution; import org.apache.jena.query.ResultSet; import org.apache.jena.sparql.core.Var; import org.apache.jena.vocabulary.RDF; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; /*** * The PubchemMeshSynonyms class provides an API to get Pubchem synonyms and MeSH terms given an InChI string. * It assumes that a Virtuoso SPARQL endpoint is running on a connected server, port 8890 and that the necessary data * has been loaded in. The default is connecting to 10.0.20.19 (Chimay). * TODO(thomas): create a Wiki page describing the Virtuoso setup and how to load the data * The HTML server can be accessed at http://<LOCAL IP OR Alias>:8890/sparql, with a UI to run SPARQL queries. Try it! */ public class PubchemMeshSynonyms { private static final Logger LOGGER = LogManager.getFormatterLogger(PubchemMeshSynonyms.class); public static final String OPTION_SERVICE_HOST = "s"; public static final String OPTION_SERVICE_PORT = "p"; public static final String OPTION_QUERY_INCHI = "i"; public static final String HELP_MESSAGE = "This class provides an API to get Pubchem synonyms and MeSH terms given an InChI string."; // The Virtuoso SPARQL endpoint, lives by default on Chimay at port 8890 private static final String DEFAULT_SERVICE_HOST = "localhost"; // chimay private static final String DEFAULT_SERVICE_PORT = "8890"; public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{ add(Option.builder(OPTION_SERVICE_HOST) .argName("SERVICE_HOST") .desc("The SPARQL server host. Default is " + DEFAULT_SERVICE_HOST) .hasArg() .longOpt("service-host") .type(String.class) ); add(Option.builder(OPTION_SERVICE_PORT) .argName("SERVICE_PORT") .desc("The SPARQL server host's port. Default is " + DEFAULT_SERVICE_PORT) .hasArg() .longOpt("service-port") .type(Integer.class) ); add(Option.builder(OPTION_QUERY_INCHI) .argName("QUERY_INCHI") .desc("The InChI string to fetch synonyms for. " + "For example, InChI=1S/C8H9NO2/c1-6(10)9-7-2-4-8(11)5-3-7/h2-5,11H,1H3,(H,9,10) representing APAP") .hasArg() .required() .longOpt("query-inchi") .type(String.class) ); }}; public static final HelpFormatter HELP_FORMATTER = new HelpFormatter(); static { HELP_FORMATTER.setWidth(100); } private static final String CID_PATTERN = "CID\\d+"; private static final String ENGLISH_LANG_TAG = "en"; private String sparqlService; /* The CID_QUERY_TMPL SelectBuilder constructs SPARQL queries like the below one: ######### PREFIX sio: <http://semanticscience.org/resource/> SELECT DISTINCT ?inchi_iri FROM <http://rdf.ncbi.nlm.nih.gov/pubchem/descriptor/compound> WHERE { ?inchi_iri sio:has-value "InChI=1S/C8H9NO2/c1-6(10)9-7-2-4-8(11)5-3-7/h2-5,11H,1H3,(H,9,10)"@en } ######### */ private static final SelectBuilder CID_QUERY_TMPL = new SelectBuilder() // PREFIX (shorthands for IRI namespaces which can be looked up on http://prefix.cc) .addPrefix("sio", "http://semanticscience.org/resource/") // SELECT .setDistinct(true) .addVar("inchi_iri") // FROM .from("http://rdf.ncbi.nlm.nih.gov/pubchem/descriptor/compound") // WHERE .addWhere("?inchi_iri", "sio:has-value", "?inchi_string" ) ; /* The PUBCHEM_SYNO_QUERY_TMPL SelectBuilder constructs SPARQL queries like the below one: ######### PREFIX sio: <http://semanticscience.org/resource/> PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX compound: <http://rdf.ncbi.nlm.nih.gov/pubchem/compound/> SELECT DISTINCT ?value ?type FROM <http://rdf.ncbi.nlm.nih.gov/pubchem/synonym> WHERE { ?syno sio:is-attribute-of compound:CID1983 ; rdf:type ?type ; sio:has-value ?value } ######### */ private static final SelectBuilder PUBCHEM_SYNO_QUERY_TMPL = new SelectBuilder() // PREFIX (shorthands for IRI namespaces which can be looked up on http://prefix.cc) .addPrefix("sio", "http://semanticscience.org/resource/") .addPrefix("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#") .addPrefix("compound", "http://rdf.ncbi.nlm.nih.gov/pubchem/compound/") // SELECT .setDistinct(true) .addVar("value") .addVar("type") // FROM .from("http://rdf.ncbi.nlm.nih.gov/pubchem/synonym") // WHERE .addWhere("?syno", "sio:is-attribute-of", "?compound") .addWhere("?syno", RDF.type, "?type") .addWhere("?syno", "sio:has-value", "?value") ; /* The MESH_TERMS_QUERY_TMPL SelectBuilder constructs SPARQL queries like the below one: ######### PREFIX sio: <http://semanticscience.org/resource/> PREFIX dcterms: <http://purl.org/dc/terms/> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> PREFIX compound: <http://rdf.ncbi.nlm.nih.gov/pubchem/compound/> PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#> SELECT DISTINCT ?concept_label ?lexical_tag FROM <http://rdf.ncbi.nlm.nih.gov/pubchem/synonym> FROM <http://id.nlm.nih.gov/mesh/> WHERE { ?syno sio:is-attribute-of compound:CID1983 ; dcterms:subject ?mesh_concept . ?mesh_concept rdfs:label ?concept_label ; meshv:preferredTerm ?mesh_term . ?mesh_term meshv:lexicalTag ?lexical_tag } ######### */ private static final SelectBuilder MESH_TERMS_QUERY_TMPL = new SelectBuilder() // PREFIX (shorthands for IRI namespaces which can be looked up on http://prefix.cc) .addPrefix("sio", "http://semanticscience.org/resource/") .addPrefix("rdfs", "http://www.w3.org/2000/01/rdf-schema#") .addPrefix("compound", "http://rdf.ncbi.nlm.nih.gov/pubchem/compound/") .addPrefix("dcterms", "http://purl.org/dc/terms/") .addPrefix("meshv", "http://id.nlm.nih.gov/mesh/vocab#") // SELECT .setDistinct(true) .addVar("concept_label") .addVar("lexical_tag") // FROM .from("http://rdf.ncbi.nlm.nih.gov/pubchem/synonym") .from("http://id.nlm.nih.gov/mesh/") // WHERE .addWhere("?syno", "sio:is-attribute-of", "?compound") .addWhere("?syno", "dcterms:subject", "?mesh_concept") .addWhere("?mesh_concept", "rdfs:label", "?concept_label") .addWhere("?mesh_concept", "meshv:preferredTerm", "?mesh_term") .addWhere("?mesh_term", "meshv:lexicalTag", "?lexical_tag") ; public static void main(final String[] args) { // Parse the command line options Options opts = new Options(); for (Option.Builder b : OPTION_BUILDERS) { opts.addOption(b.build()); } CommandLine cl = null; try { CommandLineParser parser = new DefaultParser(); cl = parser.parse(opts, args); } catch (ParseException e) { System.err.format("Argument parsing failed: %s\n", e.getMessage()); HELP_FORMATTER.printHelp(PubchemMeshSynonyms.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } if (cl.hasOption("help")) { HELP_FORMATTER.printHelp(PubchemMeshSynonyms.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); return; } String serviceHostIp = cl.getOptionValue(OPTION_SERVICE_HOST, DEFAULT_SERVICE_HOST); Integer servicePort = Integer.parseInt(cl.getOptionValue(OPTION_SERVICE_PORT, DEFAULT_SERVICE_PORT)); String queryInchi = cl.getOptionValue(OPTION_QUERY_INCHI); PubchemMeshSynonyms pubchemMeshSynonyms = new PubchemMeshSynonyms(serviceHostIp, servicePort); String cid = pubchemMeshSynonyms.fetchCIDFromInchi(queryInchi); if (cid != null) { Map<PubchemSynonymType, Set<String>> pubchemSynonyms = pubchemMeshSynonyms.fetchPubchemSynonymsFromCID(cid); LOGGER.info("Resulting Pubchem synonyms for %s are: %s", queryInchi, pubchemSynonyms); Map<MeshTermType, Set<String>> meshTerms = pubchemMeshSynonyms.fetchMeshTermsFromCID(cid); LOGGER.info("Resulting MeSH term s for %s are: %s", queryInchi, meshTerms); } else { LOGGER.info("No PubChem compound ID was found for the input InChI."); } } public PubchemMeshSynonyms() { sparqlService = getServiceFromHostParams(DEFAULT_SERVICE_HOST, Integer.parseInt(DEFAULT_SERVICE_PORT)); } public PubchemMeshSynonyms(String hostIp, Integer port) { sparqlService = getServiceFromHostParams(hostIp, port); } private String getServiceFromHostParams(String hostIp, Integer port) { URI uri = null; try { uri = new URIBuilder() .setScheme("http") .setHost(hostIp) .setPort(port) .setPath("/sparql") .build(); } catch (URISyntaxException e) { String msg = String.format("An error occurred when trying to build the SPARQL service URI: %s", e.getMessage()); LOGGER.error(msg); throw new RuntimeException(msg); } LOGGER.debug("Constructed the following URL for SPARQL service: %s", uri.toString()); return uri != null? uri.toString(): null; } public String fetchCIDFromInchi(String inchi) { // The clone method has its own implementation in the SelectBuilder. Thus safe to use! SelectBuilder sb = CID_QUERY_TMPL.clone(); // The inchi litteral needs to be create with a language tag, otherwise it will not match anything // See "Matching Litteral with Language Tags" (https://www.w3.org/TR/rdf-sparql-query/#matchLangTags) // for more information sb.setVar(Var.alloc("inchi_string"), NodeFactory.createLiteral(inchi, ENGLISH_LANG_TAG)); Query query = sb.build(); String result; LOGGER.debug("Executing SPARQL query: %s", query.toString()); try (QueryExecution qexec = QueryExecutionFactory.sparqlService(sparqlService, query)) { ResultSet results = qexec.execSelect(); // TODO: we assume here that there is at most one CID per InChI and return the first CID // Improve that behavior so we can stitch together many CID's synonyms. if (!results.hasNext()) { LOGGER.info("Could not find Pubchem Compound Id for input InChI %s", inchi); return null; } result = results.nextSolution().getResource("inchi_iri").getLocalName(); } String cid = extractCIDFromResourceName(result); LOGGER.info("Found Pubchem Compound Id %s for input InChI %s", cid, inchi); return cid; } private String extractCIDFromResourceName(String resourceName) { Pattern p = Pattern.compile(CID_PATTERN); Matcher m = p.matcher(resourceName); String cid = null; if (m.find()) { cid = m.group(0); } return cid; } public Map<PubchemSynonymType, Set<String>> fetchPubchemSynonymsFromCID(String cid) { // The clone method has its own implementation in the SelectBuilder. Thus safe to use! SelectBuilder sb = PUBCHEM_SYNO_QUERY_TMPL.clone(); sb.setVar(Var.alloc("compound"), String.format("compound:%s", cid)); Query query = sb.build(); LOGGER.debug("Executing SPARQL query: %s", query.toString()); Map<PubchemSynonymType, Set<String>> map = new HashMap<>(); try (QueryExecution queryExecution = QueryExecutionFactory.sparqlService(sparqlService, query)) { ResultSet results = queryExecution.execSelect(); while(results.hasNext()) { QuerySolution solution = results.nextSolution(); String cheminfId = solution.getResource("type").getLocalName(); String synonym = solution.getLiteral("value").getString(); LOGGER.debug("Found synonym %s with type %s", synonym, cheminfId); PubchemSynonymType synonymType = PubchemSynonymType.getByCheminfId(cheminfId); Set synonyms = map.get(synonymType); if (synonyms == null) { synonyms = new HashSet<>(); map.put(synonymType, synonyms); } synonyms.add(synonym); } } return map; } public Map<MeshTermType, Set<String>> fetchMeshTermsFromCID(String cid) { // The clone method has its own implementation in the SelectBuilder. Thus safe to use! SelectBuilder sb = MESH_TERMS_QUERY_TMPL.clone(); sb.setVar(Var.alloc("compound"), String.format("compound:%s", cid)); Query query = sb.build(); LOGGER.debug("Executing SPARQL query: %s", query.toString()); Map<MeshTermType, Set<String>> map = new HashMap<>(); try (QueryExecution queryExecution = QueryExecutionFactory.sparqlService(sparqlService, query)) { ResultSet results = queryExecution.execSelect(); while(results.hasNext()) { QuerySolution solution = results.nextSolution(); String conceptLabel = solution.getLiteral("concept_label").getString(); String lexicalTag = solution.getLiteral("lexical_tag").getString(); LOGGER.debug("Found term %s with tag %s", conceptLabel, lexicalTag); MeshTermType meshTermsType = MeshTermType.getByLexicalTag(lexicalTag); Set synonyms = map.get(meshTermsType); if (synonyms == null) { synonyms = new HashSet<>(); map.put(meshTermsType, synonyms); } synonyms.add(conceptLabel); } } return map; } }