/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation.Utils;
import chemaxon.marvin.io.MolExportException;
import chemaxon.reaction.ConcurrentReactorProcessor;
import chemaxon.reaction.ReactionException;
import chemaxon.reaction.Reactor;
import chemaxon.sss.SearchConstants;
import chemaxon.sss.search.MolSearch;
import chemaxon.sss.search.MolSearchOptions;
import chemaxon.sss.search.SearchException;
import chemaxon.struc.Molecule;
import chemaxon.struc.RxnMolecule;
import chemaxon.util.iterator.MoleculeIterator;
import chemaxon.util.iterator.MoleculeIteratorFactory;
import com.act.analysis.chemicals.molecules.MoleculeExporter;
import com.act.analysis.chemicals.molecules.MoleculeFormat;
import com.act.analysis.chemicals.molecules.MoleculeFormat$;
import org.apache.commons.collections4.Bag;
import org.apache.commons.collections4.bag.HashBag;
import org.apache.commons.collections4.iterators.PermutationIterator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
public class ReactionProjector {
private static final Logger LOGGER = LogManager.getFormatterLogger(ReactionProjector.class);
private static final String MOL_NOT_FOUND = "NOT_FOUND";
private static final MolSearchOptions LAX_SEARCH_OPTIONS = new MolSearchOptions(SearchConstants.SUBSTRUCTURE);
private static final MolSearch DEFAULT_SEARCHER = new MolSearch();
private static final String DEFAULT_MOLECULE_FORMAT = MoleculeFormat.noAuxInchi$.MODULE$.toString();
private final String moleculeFormat;
/**
* Set searcher to ignore stereo and bond type. This will allow us to optimistically match products so that we don't
* end up throwing out a reactor that produces the right compound in a slightly different form.
*/
static {
LAX_SEARCH_OPTIONS.setStereoSearchType(SearchConstants.STEREO_IGNORE);
LAX_SEARCH_OPTIONS.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL4);
DEFAULT_SEARCHER.setSearchOptions(LAX_SEARCH_OPTIONS);
}
private transient ThreadLocal<MolSearch> substrateSearcher = new ThreadLocal<MolSearch>() {
@Override
protected MolSearch initialValue() {
MolSearch search = new MolSearch();
/* These parameters were identified by trial and error as reasonable defaults for doing substructure searches for
* reactor substrate matching. */
MolSearchOptions options = new MolSearchOptions(SearchConstants.SUBSTRUCTURE);
// This allows H's in RO strings to match implicit hydrogens in our target molecules.
options.setImplicitHMatching(SearchConstants.IMPLICIT_H_MATCHING_ENABLED);
/* This allows for vague bond matching in ring structures. From the Chemaxon Docs:
* In the query all single ring bonds are replaced by "single or aromatic" and all double ring bonds are
* replaced by "double or aromatic" prior to search.
* (https://www.chemaxon.com/jchem/doc/dev/java/api/chemaxon/sss/SearchConstants.html)
*
* This should allow us to handle aromatized molecules gracefully without handling non-ring single and double
* bonds ambiguously. */
options.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL2);
// Few if any of our ROs concern stereo chemistry, so we can just ignore it.
options.setStereoSearchType(SearchConstants.STEREO_IGNORE);
/* Chemaxon's tautomer handling is weird, as sometimes it picks a non-representative tautomer as its default.
* As such, we'll allow tautomer matches to avoid excluding viable candidates. */
options.setTautomerSearch(SearchConstants.TAUTOMER_SEARCH_ON);
search.setSearchOptions(options);
return search;
}
};
private boolean useSubstructureFiltering = false;
private MolSearch searcher;
public ReactionProjector() {
this(DEFAULT_SEARCHER, DEFAULT_MOLECULE_FORMAT);
}
public ReactionProjector(MolSearch searcher) {
this(searcher, DEFAULT_MOLECULE_FORMAT);
}
public ReactionProjector(String moleculeFormat) {
this(DEFAULT_SEARCHER, moleculeFormat);
}
public ReactionProjector(MolSearch searcher, String moleculeFormat) {
this.searcher = searcher;
this.moleculeFormat = moleculeFormat;
}
public ReactionProjector(boolean useSubstructureFiltering) {
// I don't anticipate either of these ever being used if this is the constructor employed.
this.searcher = DEFAULT_SEARCHER;
this.moleculeFormat = DEFAULT_MOLECULE_FORMAT;
this.useSubstructureFiltering = useSubstructureFiltering;
}
/**
* This method should be called if projecting on more chemicals than should be stored in memory
* simultaneously. Until this is called, all chemicals acted on by the projector will be cached along
* with their inchis.
*/
public void clearInchiCache() {
MoleculeExporter.clearCache();
}
/**
* Run the given reactor until it produces the expected product. Reactor must produce one product at a time.
*
* @param reactor The reactor to run.
* @param expectedProduct The product we expect to see.
* @return The produced product; this is necessary because the reactor will produce the product with atom maps
* corresponding to the substrate, whereas the expectedProduct Molecule will not have such atom maps.
* @throws ReactionException If the expected product is not produced at all by the Reactor.
* @throws IOException
*/
public Molecule reactUntilProducesProduct(Reactor reactor, Molecule expectedProduct)
throws ReactionException {
if (reactor.getProductCount() != 1) {
throw new IllegalArgumentException("Reactor must produce exactly one product.");
}
Molecule[] products;
while ((products = reactor.react()) != null) {
if (testEquality(products[0], expectedProduct)) {
return products[0];
}
}
throw new ReactionException("Expected product not among Reactor's predictions.");
}
/**
* Tests equality of molecules based on two substructure queries. This is desirable because it makes the
* equality comparison very configurable as compared to comparing inchis.
*
* @param A One molecule.
* @param B Another molecule.
* @return True if they are equivalent.
*/
private boolean testEquality(Molecule A, Molecule B) {
return substructureTest(A, B) && substructureTest(B, A);
}
private boolean substructureTest(Molecule substructure, Molecule superstructure) {
searcher.setQuery(substructure);
searcher.setTarget(superstructure);
try {
return searcher.findFirst() != null;
} catch (SearchException e) {
// Log error but don't propagate upward. Have never seen this before.
LOGGER.error("Error on ReactionProjector.substructureTest(), %s", e.getMessage());
return false;
}
}
/**
* Get the results of a reaction in list form, rather than as a map from substrates to products.
*
* @param mols The substrates.
* @param reactor The reactor.
* @param maxProducts The maximum number of products to compute before exiting. No limit is imposed for max values
* of null or 0.
* @return A list of product sets produced by this reaction.
* @throws IOException
* @throws ReactionException
*/
public List<Molecule[]> getAllProjectedProductSets(Molecule[] mols, Reactor reactor, Integer maxProducts)
throws IOException, ReactionException {
Map<Molecule[], List<Molecule[]>> map = getRoProjectionMap(mols, reactor, maxProducts);
List<Molecule[]> allProductSets = new ArrayList<>();
for (Map.Entry<Molecule[], List<Molecule[]>> entry : map.entrySet()) {
allProductSets.addAll(entry.getValue());
}
return allProductSets;
}
public List<Molecule[]> getAllProjectedProductSets(Molecule[] mols, Reactor reactor)
throws IOException, ReactionException {
return getAllProjectedProductSets(mols, reactor, null);
}
/**
* This function takes as input an array of molecules and a Reactor and outputs the products of the transformation.
* The results are returned as a map from orderings of the substrates to the products produced by those orderings.
* In most cases the map will have only one entry, but in some cases different orderings of substrates can lead to
* different valid predictions.
* <p>
* Note that, for efficient two substrate expansion, the specialized method
* fastProjectionOfTwoSubstrateRoOntoTwoMolecules should be used instead of this one.
*
* @param mols An array of molecules representing the chemical reactants.
* @param reactor A Reactor representing the reaction to apply.
* @param maxProducts The maximum number of products to compute before exiting. No limit is imposed for max values
* of null or 0.
* @return The substrate -> product map.
*/
public Map<Molecule[], List<Molecule[]>> getRoProjectionMap(Molecule[] mols, Reactor reactor, Integer maxProducts)
throws ReactionException, IOException {
Map<Molecule[], List<Molecule[]>> resultsMap = new HashMap<>();
if (mols.length != reactor.getReactantCount()) {
LOGGER.debug("Tried to project RO with %d substrates on set of %d substrates",
reactor.getReactantCount(),
mols.length);
return resultsMap;
}
// If there is only one reactant, we can do just a simple reaction computation. However, if we have multiple reactants,
// we have to use the ConcurrentReactorProcessor API since it gives us the ability to combinatorially explore all
// possible matching combinations of reactants on the substrates of the RO.
if (mols.length == 1) {
List<Molecule[]> productSets = getProductsFixedOrder(reactor, mols, maxProducts);
if (!productSets.isEmpty()) {
resultsMap.put(mols, productSets);
}
} else if (useSubstructureFiltering) {
Optional<List<Set<Integer>>> viableSubstrateIndexes;
try {
viableSubstrateIndexes = matchCandidatesToSubstrateStructures(reactor, mols);
} catch (SearchException e) {
throw new ReactionException("Caught exception when performing pre-reaction substructure search", e);
}
if (viableSubstrateIndexes.isPresent()) {
List<Integer> allIndexes = new ArrayList<Integer>(mols.length) {{
for (int i = 0; i < mols.length; i++) {
add(i);
}
}};
int permutationIndex = 0;
PermutationIterator<Integer> iter = new PermutationIterator<>(allIndexes);
while (iter.hasNext()) {
permutationIndex++;
List<Integer> permutation = iter.next();
if (permutationFitsSubstructureMatches(permutation, viableSubstrateIndexes.get())) {
Molecule[] substrates = indexPermutationToMolecules(mols, permutation);
reactor.setReactants(substrates);
Molecule[] products;
List<Molecule[]> results = new ArrayList<>();
while ((products = reactor.react()) != null) {
results.add(products);
if (maxProducts != null && maxProducts > 0 && results.size() >= maxProducts) {
break;
}
}
if (results.size() > 0) {
resultsMap.put(substrates, results);
}
}
}
} // Otherwise just return the empty map.
} else {
// TODO: why not make one of these per ReactionProjector object?
// TODO: replace this with Apache commons PermutationIterator for clean iteration over distinct permutations.
ConcurrentReactorProcessor reactorProcessor = new ConcurrentReactorProcessor();
reactorProcessor.setReactor(reactor);
// This step is needed by ConcurrentReactorProcessor for the combinatorial exploration.
// The iterator takes in the same substrates in each iteration step to build a matrix of combinations.
// For example, if we have A + B -> C, the iterator creates this array: [[A,B], [A,B]]. Based on this,
// ConcurrentReactorProcessor will cross the elements in the first index with the second, creating the combinations
// A+A, A+B, B+A, B+B and operates all of those on the RO, which is what is desired.
MoleculeIterator[] iterator = new MoleculeIterator[mols.length];
for (int i = 0; i < mols.length; i++) {
iterator[i] = MoleculeIteratorFactory.createMoleculeIterator(mols);
}
reactorProcessor.setReactantIterators(iterator, ConcurrentReactorProcessor.MODE_COMBINATORIAL);
// Bag is a multi-set class that ships with Apache commons collection, and we already use many commons libs--easy!
Bag<Molecule> originalReactantsSet = new HashBag<>(Arrays.asList(mols));
// This set keeps track of substrate combinations we've used, and avoids repeats. Repeats can occur
// when several substrates are identical, and can be put in "different" but symmetric orderings.
Set<String> substrateHashes = new HashSet<>();
List<Molecule[]> results = null;
int reactantCombination = 0;
while ((results = reactorProcessor.reactNext()) != null) {
reactantCombination++;
Molecule[] reactants = reactorProcessor.getReactants();
if (results.isEmpty()) {
LOGGER.debug("No results found for reactants combination %d, skipping", reactantCombination);
continue;
}
Bag<Molecule> thisReactantSet = new HashBag<>(Arrays.asList(reactants));
if (!originalReactantsSet.equals(thisReactantSet)) {
LOGGER.debug("Reactant set %d does not represent original, complete reactant sets, skipping",
reactantCombination);
continue;
}
String thisHash = getStringHash(reactants);
if (substrateHashes.contains(thisHash)) {
continue;
}
substrateHashes.add(thisHash);
resultsMap.put(reactants, results);
if (maxProducts != null && maxProducts > 0 && resultsMap.size() >= maxProducts) {
break;
}
}
}
return resultsMap;
}
private Optional<List<Set<Integer>>> matchCandidatesToSubstrateStructures(Reactor reactor, Molecule[] candidates)
throws SearchException, MolExportException {
RxnMolecule rxnMol = reactor.getReaction();
Molecule[] substrateStructures = rxnMol.getReactants();
MolSearch search = substrateSearcher.get();
/* Each list in `results` is a set of viable indexes into candidates for a given substrate position in the reactor.
* So if candidates contains three molecules and results looks like:
* 0: 0, 1
* 1: 1
* 2: 0, 2
*
* Then candidates[0] can appear as substrate 0 or 2, candidates[1] as substrate 0 or 1, and candidates[2] only as
* substrate 2. */
List<Set<Integer>> results = new ArrayList<>(substrateStructures.length);
for (int i = 0; i < substrateStructures.length; i++) {
search.setQuery(substrateStructures[i]);
results.add(new HashSet<>());
/* Test each candidate molecule against each substrate structure of the reactor. If we can't find a match,
* there's no way that the molecule could appear in a particular position in the reaction and we can later
* eliminate input molecule permutations that don't conform to this structure match. */
for (int j = 0; j < candidates.length; j++) {
search.setTarget(candidates[j]);
// Any match will do, so just call findFirst and ensure we get some sort of results.
int[] res = search.findFirst();
if (res != null) {
results.get(i).add(j);
}
}
/* If one of the substrate patterns can't be matched to any input structure, then there are no viable permutations
* to consider. We tell the call as much by returning `empty`. */
if (results.get(i).size() == 0) {
return Optional.empty();
}
}
return Optional.of(results);
}
private Boolean permutationFitsSubstructureMatches(
List<Integer> permutation, List<Set<Integer>> viableIndexes) {
/* For each permutation, match its value at each location against the list of viable indexes for that location.
* If a given permutation is 0, 2, 1 and viableIndexes looks like
* 0: 0, 1
* 1: 1
* 2: 0, 2
* as above, then then the second structure index doesn't pass a substructure search and we can skip it. */
for (int i = 0; i < permutation.size(); i++) {
if (!viableIndexes.get(i).contains(permutation.get(i))) {
return false;
}
}
return true;
}
private Molecule[] indexPermutationToMolecules(Molecule[] molecules, List<Integer> permutation) {
Molecule[] results = new Molecule[molecules.length];
for (int i = 0; i < permutation.size(); i++) {
results[i] = molecules[permutation.get(i)];
}
return results;
}
public Map<Molecule[], List<Molecule[]>> getRoProjectionMap(Molecule[] mols, Reactor reactor)
throws ReactionException, IOException {
return getRoProjectionMap(mols, reactor, null);
}
/**
* This function projects all possible combinations of two input substrates onto a 2 substrate RO, then
* cleans and returns the results of that projection.
* TODO: Expand this class to handle 3 or 4 substrate reactions.
*
* @param mols Substrate molecules
* @param reactor The two substrate reactor
* @return A list of product arrays, where each array represents the products of a given reaction combination.
* @throws ReactionException
* @throws IOException
*/
public Map<Molecule[], List<Molecule[]>> fastProjectionOfTwoSubstrateRoOntoTwoMolecules(Molecule[] mols, Reactor reactor)
throws ReactionException, IOException {
Map<Molecule[], List<Molecule[]>> results = new HashMap<>();
Molecule[] firstCombinationOfSubstrates = new Molecule[]{mols[0], mols[1]};
List<Molecule[]> productSets = getProductsFixedOrder(reactor, firstCombinationOfSubstrates);
if (!productSets.isEmpty()) {
results.put(firstCombinationOfSubstrates, productSets);
}
// Second ordering is same if two molecules are equal.
if (getMoleculeString(mols[0]).equals(getMoleculeString(mols[1]))) {
return results;
}
Molecule[] secondCombinationOfSubstrates = new Molecule[]{mols[1], mols[0]};
productSets = getProductsFixedOrder(reactor, firstCombinationOfSubstrates);
if (!productSets.isEmpty()) {
results.put(secondCombinationOfSubstrates, productSets);
}
return results;
}
/**
* Gets all product sets that a Reactor produces when applies to a given array of substrates, in only the order
* that they are already in.
*
* @param reactor The Reactor.
* @param substrates The substrates.
* @param maxProducts The maximum number of products to compute before exiting. No limit is imposed for max values
* of null or 0.
* @return A list of product arrays returned by the Reactor.
* @throws ReactionException
*/
public List<Molecule[]> getProductsFixedOrder(Reactor reactor, Molecule[] substrates, Integer maxProducts)
throws ReactionException {
reactor.setReactants(substrates);
List<Molecule[]> results = new ArrayList<>();
Molecule[] products;
while ((products = reactor.react()) != null) {
results.add(products);
if (maxProducts != null && maxProducts > 0 && results.size() >= maxProducts) {
break;
}
}
return results;
}
public List<Molecule[]> getProductsFixedOrder(Reactor reactor, Molecule[] substrates)
throws ReactionException {
return getProductsFixedOrder(reactor, substrates, null);
}
/**
* Builds a string which will be identical for two molecule arrays which represent the same molecules
* in the same order, and different otherwise.
*
* @param mols The array of molecules.
* @return The string representation.
* @throws IOException
*/
private String getStringHash(Molecule[] mols) throws IOException {
StringBuilder builder = new StringBuilder();
for (Molecule molecule : mols) {
builder.append(getMoleculeString(molecule));
}
return builder.toString();
}
/**
* Gets a a string of the molecule. The exporter takes care of all caching so we don't need to worry about it
*
* @param molecule The molecule.
* @return The molecule's string presentation in the format that this class declares..
* @throws IOException
*/
private String getMoleculeString(Molecule molecule) throws IOException {
return MoleculeExporter.exportMolecule(molecule, MoleculeFormat$.MODULE$.getName(this.moleculeFormat));
}
}