/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.act.biointerpretation.Utils; import chemaxon.marvin.io.MolExportException; import chemaxon.reaction.ConcurrentReactorProcessor; import chemaxon.reaction.ReactionException; import chemaxon.reaction.Reactor; import chemaxon.sss.SearchConstants; import chemaxon.sss.search.MolSearch; import chemaxon.sss.search.MolSearchOptions; import chemaxon.sss.search.SearchException; import chemaxon.struc.Molecule; import chemaxon.struc.RxnMolecule; import chemaxon.util.iterator.MoleculeIterator; import chemaxon.util.iterator.MoleculeIteratorFactory; import com.act.analysis.chemicals.molecules.MoleculeExporter; import com.act.analysis.chemicals.molecules.MoleculeFormat; import com.act.analysis.chemicals.molecules.MoleculeFormat$; import org.apache.commons.collections4.Bag; import org.apache.commons.collections4.bag.HashBag; import org.apache.commons.collections4.iterators.PermutationIterator; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; public class ReactionProjector { private static final Logger LOGGER = LogManager.getFormatterLogger(ReactionProjector.class); private static final String MOL_NOT_FOUND = "NOT_FOUND"; private static final MolSearchOptions LAX_SEARCH_OPTIONS = new MolSearchOptions(SearchConstants.SUBSTRUCTURE); private static final MolSearch DEFAULT_SEARCHER = new MolSearch(); private static final String DEFAULT_MOLECULE_FORMAT = MoleculeFormat.noAuxInchi$.MODULE$.toString(); private final String moleculeFormat; /** * Set searcher to ignore stereo and bond type. This will allow us to optimistically match products so that we don't * end up throwing out a reactor that produces the right compound in a slightly different form. */ static { LAX_SEARCH_OPTIONS.setStereoSearchType(SearchConstants.STEREO_IGNORE); LAX_SEARCH_OPTIONS.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL4); DEFAULT_SEARCHER.setSearchOptions(LAX_SEARCH_OPTIONS); } private transient ThreadLocal<MolSearch> substrateSearcher = new ThreadLocal<MolSearch>() { @Override protected MolSearch initialValue() { MolSearch search = new MolSearch(); /* These parameters were identified by trial and error as reasonable defaults for doing substructure searches for * reactor substrate matching. */ MolSearchOptions options = new MolSearchOptions(SearchConstants.SUBSTRUCTURE); // This allows H's in RO strings to match implicit hydrogens in our target molecules. options.setImplicitHMatching(SearchConstants.IMPLICIT_H_MATCHING_ENABLED); /* This allows for vague bond matching in ring structures. From the Chemaxon Docs: * In the query all single ring bonds are replaced by "single or aromatic" and all double ring bonds are * replaced by "double or aromatic" prior to search. * (https://www.chemaxon.com/jchem/doc/dev/java/api/chemaxon/sss/SearchConstants.html) * * This should allow us to handle aromatized molecules gracefully without handling non-ring single and double * bonds ambiguously. */ options.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL2); // Few if any of our ROs concern stereo chemistry, so we can just ignore it. options.setStereoSearchType(SearchConstants.STEREO_IGNORE); /* Chemaxon's tautomer handling is weird, as sometimes it picks a non-representative tautomer as its default. * As such, we'll allow tautomer matches to avoid excluding viable candidates. */ options.setTautomerSearch(SearchConstants.TAUTOMER_SEARCH_ON); search.setSearchOptions(options); return search; } }; private boolean useSubstructureFiltering = false; private MolSearch searcher; public ReactionProjector() { this(DEFAULT_SEARCHER, DEFAULT_MOLECULE_FORMAT); } public ReactionProjector(MolSearch searcher) { this(searcher, DEFAULT_MOLECULE_FORMAT); } public ReactionProjector(String moleculeFormat) { this(DEFAULT_SEARCHER, moleculeFormat); } public ReactionProjector(MolSearch searcher, String moleculeFormat) { this.searcher = searcher; this.moleculeFormat = moleculeFormat; } public ReactionProjector(boolean useSubstructureFiltering) { // I don't anticipate either of these ever being used if this is the constructor employed. this.searcher = DEFAULT_SEARCHER; this.moleculeFormat = DEFAULT_MOLECULE_FORMAT; this.useSubstructureFiltering = useSubstructureFiltering; } /** * This method should be called if projecting on more chemicals than should be stored in memory * simultaneously. Until this is called, all chemicals acted on by the projector will be cached along * with their inchis. */ public void clearInchiCache() { MoleculeExporter.clearCache(); } /** * Run the given reactor until it produces the expected product. Reactor must produce one product at a time. * * @param reactor The reactor to run. * @param expectedProduct The product we expect to see. * @return The produced product; this is necessary because the reactor will produce the product with atom maps * corresponding to the substrate, whereas the expectedProduct Molecule will not have such atom maps. * @throws ReactionException If the expected product is not produced at all by the Reactor. * @throws IOException */ public Molecule reactUntilProducesProduct(Reactor reactor, Molecule expectedProduct) throws ReactionException { if (reactor.getProductCount() != 1) { throw new IllegalArgumentException("Reactor must produce exactly one product."); } Molecule[] products; while ((products = reactor.react()) != null) { if (testEquality(products[0], expectedProduct)) { return products[0]; } } throw new ReactionException("Expected product not among Reactor's predictions."); } /** * Tests equality of molecules based on two substructure queries. This is desirable because it makes the * equality comparison very configurable as compared to comparing inchis. * * @param A One molecule. * @param B Another molecule. * @return True if they are equivalent. */ private boolean testEquality(Molecule A, Molecule B) { return substructureTest(A, B) && substructureTest(B, A); } private boolean substructureTest(Molecule substructure, Molecule superstructure) { searcher.setQuery(substructure); searcher.setTarget(superstructure); try { return searcher.findFirst() != null; } catch (SearchException e) { // Log error but don't propagate upward. Have never seen this before. LOGGER.error("Error on ReactionProjector.substructureTest(), %s", e.getMessage()); return false; } } /** * Get the results of a reaction in list form, rather than as a map from substrates to products. * * @param mols The substrates. * @param reactor The reactor. * @param maxProducts The maximum number of products to compute before exiting. No limit is imposed for max values * of null or 0. * @return A list of product sets produced by this reaction. * @throws IOException * @throws ReactionException */ public List<Molecule[]> getAllProjectedProductSets(Molecule[] mols, Reactor reactor, Integer maxProducts) throws IOException, ReactionException { Map<Molecule[], List<Molecule[]>> map = getRoProjectionMap(mols, reactor, maxProducts); List<Molecule[]> allProductSets = new ArrayList<>(); for (Map.Entry<Molecule[], List<Molecule[]>> entry : map.entrySet()) { allProductSets.addAll(entry.getValue()); } return allProductSets; } public List<Molecule[]> getAllProjectedProductSets(Molecule[] mols, Reactor reactor) throws IOException, ReactionException { return getAllProjectedProductSets(mols, reactor, null); } /** * This function takes as input an array of molecules and a Reactor and outputs the products of the transformation. * The results are returned as a map from orderings of the substrates to the products produced by those orderings. * In most cases the map will have only one entry, but in some cases different orderings of substrates can lead to * different valid predictions. * <p> * Note that, for efficient two substrate expansion, the specialized method * fastProjectionOfTwoSubstrateRoOntoTwoMolecules should be used instead of this one. * * @param mols An array of molecules representing the chemical reactants. * @param reactor A Reactor representing the reaction to apply. * @param maxProducts The maximum number of products to compute before exiting. No limit is imposed for max values * of null or 0. * @return The substrate -> product map. */ public Map<Molecule[], List<Molecule[]>> getRoProjectionMap(Molecule[] mols, Reactor reactor, Integer maxProducts) throws ReactionException, IOException { Map<Molecule[], List<Molecule[]>> resultsMap = new HashMap<>(); if (mols.length != reactor.getReactantCount()) { LOGGER.debug("Tried to project RO with %d substrates on set of %d substrates", reactor.getReactantCount(), mols.length); return resultsMap; } // If there is only one reactant, we can do just a simple reaction computation. However, if we have multiple reactants, // we have to use the ConcurrentReactorProcessor API since it gives us the ability to combinatorially explore all // possible matching combinations of reactants on the substrates of the RO. if (mols.length == 1) { List<Molecule[]> productSets = getProductsFixedOrder(reactor, mols, maxProducts); if (!productSets.isEmpty()) { resultsMap.put(mols, productSets); } } else if (useSubstructureFiltering) { Optional<List<Set<Integer>>> viableSubstrateIndexes; try { viableSubstrateIndexes = matchCandidatesToSubstrateStructures(reactor, mols); } catch (SearchException e) { throw new ReactionException("Caught exception when performing pre-reaction substructure search", e); } if (viableSubstrateIndexes.isPresent()) { List<Integer> allIndexes = new ArrayList<Integer>(mols.length) {{ for (int i = 0; i < mols.length; i++) { add(i); } }}; int permutationIndex = 0; PermutationIterator<Integer> iter = new PermutationIterator<>(allIndexes); while (iter.hasNext()) { permutationIndex++; List<Integer> permutation = iter.next(); if (permutationFitsSubstructureMatches(permutation, viableSubstrateIndexes.get())) { Molecule[] substrates = indexPermutationToMolecules(mols, permutation); reactor.setReactants(substrates); Molecule[] products; List<Molecule[]> results = new ArrayList<>(); while ((products = reactor.react()) != null) { results.add(products); if (maxProducts != null && maxProducts > 0 && results.size() >= maxProducts) { break; } } if (results.size() > 0) { resultsMap.put(substrates, results); } } } } // Otherwise just return the empty map. } else { // TODO: why not make one of these per ReactionProjector object? // TODO: replace this with Apache commons PermutationIterator for clean iteration over distinct permutations. ConcurrentReactorProcessor reactorProcessor = new ConcurrentReactorProcessor(); reactorProcessor.setReactor(reactor); // This step is needed by ConcurrentReactorProcessor for the combinatorial exploration. // The iterator takes in the same substrates in each iteration step to build a matrix of combinations. // For example, if we have A + B -> C, the iterator creates this array: [[A,B], [A,B]]. Based on this, // ConcurrentReactorProcessor will cross the elements in the first index with the second, creating the combinations // A+A, A+B, B+A, B+B and operates all of those on the RO, which is what is desired. MoleculeIterator[] iterator = new MoleculeIterator[mols.length]; for (int i = 0; i < mols.length; i++) { iterator[i] = MoleculeIteratorFactory.createMoleculeIterator(mols); } reactorProcessor.setReactantIterators(iterator, ConcurrentReactorProcessor.MODE_COMBINATORIAL); // Bag is a multi-set class that ships with Apache commons collection, and we already use many commons libs--easy! Bag<Molecule> originalReactantsSet = new HashBag<>(Arrays.asList(mols)); // This set keeps track of substrate combinations we've used, and avoids repeats. Repeats can occur // when several substrates are identical, and can be put in "different" but symmetric orderings. Set<String> substrateHashes = new HashSet<>(); List<Molecule[]> results = null; int reactantCombination = 0; while ((results = reactorProcessor.reactNext()) != null) { reactantCombination++; Molecule[] reactants = reactorProcessor.getReactants(); if (results.isEmpty()) { LOGGER.debug("No results found for reactants combination %d, skipping", reactantCombination); continue; } Bag<Molecule> thisReactantSet = new HashBag<>(Arrays.asList(reactants)); if (!originalReactantsSet.equals(thisReactantSet)) { LOGGER.debug("Reactant set %d does not represent original, complete reactant sets, skipping", reactantCombination); continue; } String thisHash = getStringHash(reactants); if (substrateHashes.contains(thisHash)) { continue; } substrateHashes.add(thisHash); resultsMap.put(reactants, results); if (maxProducts != null && maxProducts > 0 && resultsMap.size() >= maxProducts) { break; } } } return resultsMap; } private Optional<List<Set<Integer>>> matchCandidatesToSubstrateStructures(Reactor reactor, Molecule[] candidates) throws SearchException, MolExportException { RxnMolecule rxnMol = reactor.getReaction(); Molecule[] substrateStructures = rxnMol.getReactants(); MolSearch search = substrateSearcher.get(); /* Each list in `results` is a set of viable indexes into candidates for a given substrate position in the reactor. * So if candidates contains three molecules and results looks like: * 0: 0, 1 * 1: 1 * 2: 0, 2 * * Then candidates[0] can appear as substrate 0 or 2, candidates[1] as substrate 0 or 1, and candidates[2] only as * substrate 2. */ List<Set<Integer>> results = new ArrayList<>(substrateStructures.length); for (int i = 0; i < substrateStructures.length; i++) { search.setQuery(substrateStructures[i]); results.add(new HashSet<>()); /* Test each candidate molecule against each substrate structure of the reactor. If we can't find a match, * there's no way that the molecule could appear in a particular position in the reaction and we can later * eliminate input molecule permutations that don't conform to this structure match. */ for (int j = 0; j < candidates.length; j++) { search.setTarget(candidates[j]); // Any match will do, so just call findFirst and ensure we get some sort of results. int[] res = search.findFirst(); if (res != null) { results.get(i).add(j); } } /* If one of the substrate patterns can't be matched to any input structure, then there are no viable permutations * to consider. We tell the call as much by returning `empty`. */ if (results.get(i).size() == 0) { return Optional.empty(); } } return Optional.of(results); } private Boolean permutationFitsSubstructureMatches( List<Integer> permutation, List<Set<Integer>> viableIndexes) { /* For each permutation, match its value at each location against the list of viable indexes for that location. * If a given permutation is 0, 2, 1 and viableIndexes looks like * 0: 0, 1 * 1: 1 * 2: 0, 2 * as above, then then the second structure index doesn't pass a substructure search and we can skip it. */ for (int i = 0; i < permutation.size(); i++) { if (!viableIndexes.get(i).contains(permutation.get(i))) { return false; } } return true; } private Molecule[] indexPermutationToMolecules(Molecule[] molecules, List<Integer> permutation) { Molecule[] results = new Molecule[molecules.length]; for (int i = 0; i < permutation.size(); i++) { results[i] = molecules[permutation.get(i)]; } return results; } public Map<Molecule[], List<Molecule[]>> getRoProjectionMap(Molecule[] mols, Reactor reactor) throws ReactionException, IOException { return getRoProjectionMap(mols, reactor, null); } /** * This function projects all possible combinations of two input substrates onto a 2 substrate RO, then * cleans and returns the results of that projection. * TODO: Expand this class to handle 3 or 4 substrate reactions. * * @param mols Substrate molecules * @param reactor The two substrate reactor * @return A list of product arrays, where each array represents the products of a given reaction combination. * @throws ReactionException * @throws IOException */ public Map<Molecule[], List<Molecule[]>> fastProjectionOfTwoSubstrateRoOntoTwoMolecules(Molecule[] mols, Reactor reactor) throws ReactionException, IOException { Map<Molecule[], List<Molecule[]>> results = new HashMap<>(); Molecule[] firstCombinationOfSubstrates = new Molecule[]{mols[0], mols[1]}; List<Molecule[]> productSets = getProductsFixedOrder(reactor, firstCombinationOfSubstrates); if (!productSets.isEmpty()) { results.put(firstCombinationOfSubstrates, productSets); } // Second ordering is same if two molecules are equal. if (getMoleculeString(mols[0]).equals(getMoleculeString(mols[1]))) { return results; } Molecule[] secondCombinationOfSubstrates = new Molecule[]{mols[1], mols[0]}; productSets = getProductsFixedOrder(reactor, firstCombinationOfSubstrates); if (!productSets.isEmpty()) { results.put(secondCombinationOfSubstrates, productSets); } return results; } /** * Gets all product sets that a Reactor produces when applies to a given array of substrates, in only the order * that they are already in. * * @param reactor The Reactor. * @param substrates The substrates. * @param maxProducts The maximum number of products to compute before exiting. No limit is imposed for max values * of null or 0. * @return A list of product arrays returned by the Reactor. * @throws ReactionException */ public List<Molecule[]> getProductsFixedOrder(Reactor reactor, Molecule[] substrates, Integer maxProducts) throws ReactionException { reactor.setReactants(substrates); List<Molecule[]> results = new ArrayList<>(); Molecule[] products; while ((products = reactor.react()) != null) { results.add(products); if (maxProducts != null && maxProducts > 0 && results.size() >= maxProducts) { break; } } return results; } public List<Molecule[]> getProductsFixedOrder(Reactor reactor, Molecule[] substrates) throws ReactionException { return getProductsFixedOrder(reactor, substrates, null); } /** * Builds a string which will be identical for two molecule arrays which represent the same molecules * in the same order, and different otherwise. * * @param mols The array of molecules. * @return The string representation. * @throws IOException */ private String getStringHash(Molecule[] mols) throws IOException { StringBuilder builder = new StringBuilder(); for (Molecule molecule : mols) { builder.append(getMoleculeString(molecule)); } return builder.toString(); } /** * Gets a a string of the molecule. The exporter takes care of all caching so we don't need to worry about it * * @param molecule The molecule. * @return The molecule's string presentation in the format that this class declares.. * @throws IOException */ private String getMoleculeString(Molecule molecule) throws IOException { return MoleculeExporter.exportMolecule(molecule, MoleculeFormat$.MODULE$.getName(this.moleculeFormat)); } }