/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.act.analysis.similarity; import act.server.NoSQLAPI; import act.shared.Chemical; import chemaxon.calculations.clean.Cleaner; import chemaxon.formats.MolFormatException; import chemaxon.formats.MolImporter; import chemaxon.reaction.ReactionException; import chemaxon.sss.SearchConstants; import chemaxon.sss.search.MolSearch; import chemaxon.sss.search.MolSearchOptions; import chemaxon.sss.search.SearchException; import chemaxon.struc.Molecule; import chemaxon.struc.MoleculeGraph; import chemaxon.util.MolHandler; import com.act.biointerpretation.mechanisminspection.Ero; import com.act.biointerpretation.mechanisminspection.ErosCorpus; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.DefaultParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; public class ROBinning { private static final Logger LOGGER = LogManager.getFormatterLogger(ROBinning.class); private static final int SUBSTRATE_SIDE_OF_REACTION = 0; public static final String OPTION_DB = "d"; public static final String HELP_MESSAGE = StringUtils.join(new String[] { "This class does substructure matching of every chemical against the RO corpus and adds the matching substrates of", "ROs to the chemical DB" }, ""); public static final HelpFormatter HELP_FORMATTER = new HelpFormatter(); static { HELP_FORMATTER.setWidth(100); } public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{ add(Option.builder(OPTION_DB) .argName("db-name") .desc("The database name from which chemicals are read and updated") .hasArg().required() .longOpt("db-name") ); // Everybody needs a little help from their friends. add(Option.builder("h") .argName("help") .desc("Prints this help message") .longOpt("help") ); }}; // From https://docs.chemaxon.com/display/jchembase/Bond+specific+search+options. public static final MolSearchOptions SEARCH_OPTIONS = new MolSearchOptions(SearchConstants.SUBSTRUCTURE); static { SEARCH_OPTIONS.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL4); } private ErosCorpus erosCorpus; private NoSQLAPI api; private Map<String, Pair<MolSearch, Set<Integer>>> smileToSearchQuery = new HashMap<>(); public static void main(String[] args) throws Exception { Options opts = new Options(); for (Option.Builder b : OPTION_BUILDERS) { opts.addOption(b.build()); } CommandLine cl = null; try { CommandLineParser parser = new DefaultParser(); cl = parser.parse(opts, args); } catch (ParseException e) { System.err.format("Argument parsing failed: %s\n", e.getMessage()); HELP_FORMATTER.printHelp(ROBinning.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } if (cl.hasOption("help")) { HELP_FORMATTER.printHelp(ROBinning.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); return; } String dbName = cl.getOptionValue(OPTION_DB); // We read and write to the same database NoSQLAPI api = new NoSQLAPI(dbName, dbName); ErosCorpus erosCorpus = new ErosCorpus(); erosCorpus.loadValidationCorpus(); ROBinning roBinning = new ROBinning(erosCorpus, api); roBinning.init(); roBinning.processChemicals(); } public ROBinning(ErosCorpus loadedCorpus, NoSQLAPI noSQLAPI) { erosCorpus = loadedCorpus; api = noSQLAPI; } public void init() throws MolFormatException { for (Ero ro : erosCorpus.getRos()) { String smartsNotation = ro.getRo(); for (String substrateSmile : extractSubstratesFromRO(smartsNotation)) { Pair<MolSearch, Set<Integer>> molSearchListPair = smileToSearchQuery.get(substrateSmile); if (molSearchListPair == null) { MolSearchOptions searchOptions = new MolSearchOptions(SearchConstants.SUBSTRUCTURE); searchOptions.setStereoModel(SearchConstants.STEREO_MODEL_LOCAL); searchOptions.setStereoSearchType(SearchConstants.STEREO_EXACT); MolSearch ms = new MolSearch(); ms.setSearchOptions(searchOptions); ms.setQuery(new MolHandler(substrateSmile, true).getMolecule()); Set<Integer> newRoList = new HashSet<>(); molSearchListPair = Pair.of(ms, newRoList); smileToSearchQuery.put(substrateSmile, molSearchListPair); } molSearchListPair.getRight().add(ro.getId()); } } } private List<String> extractSubstratesFromRO(String ro) { String[] splitReaction = ro.split(">>"); List<String> chemicalsInSmilesNotation = new ArrayList<>(); String substrateSide = splitReaction[SUBSTRATE_SIDE_OF_REACTION]; // Split the substrate side of the reaction to it's individual components. A reaction is structured as such: // molA.molB>>molC.molD, so we have to split on the '.' on the substrate side. String[] substrateSmiles = substrateSide.split("\\."); for (String substrateSmile : substrateSmiles) { // Remove all the redundant chemical labeling from the processed RO. substrateSmile = substrateSmile.replaceAll(":[0-9]+", ""); chemicalsInSmilesNotation.add(substrateSmile); } return chemicalsInSmilesNotation; } private List<Integer> rosThatMatchTargetMolecule(Molecule target) throws SearchException { Set<Integer> matchedResults = new HashSet<>(); for (Map.Entry<String, Pair<MolSearch, Set<Integer>>> entry : smileToSearchQuery.entrySet()) { MolSearch searcher = entry.getValue().getLeft(); searcher.setTarget(target); // int[][] hits is an array containing the matches as arrays or null if there are no hits. // The match arrays contain the atom indexes of the target atoms that match the query atoms // (in the order of the appropriate query atoms). If there is substructure match, there atleast is one // hit with a length > 0. int[][] hits = searcher.findAll(); if (hits != null) { for (int i = 0; i < hits.length; i++) { if (hits[i].length > 0) { matchedResults.addAll(entry.getValue().getRight()); break; } } } } List<Integer> result = new ArrayList<>(); result.addAll(matchedResults); return result; } protected void processChemicals() throws IOException, ReactionException, SearchException { Iterator<Chemical> chemicals = api.readChemsFromInKnowledgeGraph(); while (chemicals.hasNext()) { Chemical chem = chemicals.next(); List<Integer> matchedRos = processChemical(chem.getInChI()); if (matchedRos != null && matchedRos.size() > 0) { api.getWriteDB().updateChemicalWithRoBinningInformation(chem.getUuid(), matchedRos); } } } public List<Integer> processChemical(String inchi) throws SearchException { Molecule molecule; try { molecule = MolImporter.importMol(inchi, "inchi"); } catch (Exception e) { LOGGER.error(e.getMessage()); return null; } return processChemical(molecule); } public List<Integer> processChemical(Molecule molecule) throws SearchException { Cleaner.clean(molecule, 2); molecule.aromatize(MoleculeGraph.AROM_BASIC); List<Integer> results = rosThatMatchTargetMolecule(molecule); Collections.sort(results); return results; } }