/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation.sars;
import chemaxon.calculations.hydrogenize.Hydrogenize;
import chemaxon.reaction.AtomIdentifier;
import chemaxon.reaction.ReactionException;
import chemaxon.reaction.Reactor;
import chemaxon.sss.SearchConstants;
import chemaxon.sss.search.MolSearch;
import chemaxon.sss.search.MolSearchOptions;
import chemaxon.sss.search.SearchException;
import chemaxon.sss.search.SearchHit;
import chemaxon.struc.MolAtom;
import chemaxon.struc.Molecule;
import chemaxon.struc.RxnMolecule;
import com.act.biointerpretation.Utils.ReactionProjector;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
public class ExpandedReactionSearcher {
private static final Logger LOGGER = LogManager.getFormatterLogger(ExpandedReactionSearcher.class);
private static final MolSearchOptions LAX_SEARCH_OPTIONS = new MolSearchOptions(SearchConstants.SUBSTRUCTURE);
private static final MolSearch DEFAULT_SEARCHER = new MolSearch();
static {
LAX_SEARCH_OPTIONS.setStereoSearchType(SearchConstants.STEREO_IGNORE);
LAX_SEARCH_OPTIONS.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL4);
DEFAULT_SEARCHER.setSearchOptions(LAX_SEARCH_OPTIONS);
}
private static final Hydrogenize HYDROGENIZER = new Hydrogenize();
private final ReactionProjector projector;
private final MolSearch searcher;
// This value should always store the smallest value that has not thus far been used as atom map value.
private int nextLabel;
private Reactor seedReactor;
private Molecule substrate;
private Molecule expectedProduct;
private Molecule substructure;
private Molecule predictedProduct;
private Iterator<Molecule> fragmentPointer;
private Molecule currentFrag;
private SearchHit currentHit;
public ExpandedReactionSearcher(ReactionProjector projector, MolSearch searcher) {
this.projector = projector;
this.searcher = searcher;
}
public ExpandedReactionSearcher(ReactionProjector projector) {
this.projector = projector;
this.searcher = DEFAULT_SEARCHER;
}
/**
* Initializes the searcher for a search by projecting the given Reactor on its substrate until it produces
* the correct product, and resetting the fields for tracking the progress of the iterator. Must be called
* before calling getNextReactor
*
* @param seed The seed Reactor.
* @param sub The substrate.
* @param prod The expected product.
* @param substruct The MCS, from a set of reactions containing this one.
* @throws ReactionException
* @throws SearchException
*/
public void initSearch(Reactor seed, Molecule sub, Molecule prod, Molecule substruct) throws ReactionException, SearchException {
this.seedReactor = seed;
this.substrate = sub;
this.expectedProduct = prod;
this.substructure = substruct;
// Ensure that the resulting Reactor will include explicit hydrogens from RO
HYDROGENIZER.convertImplicitHToExplicit(substrate);
// Label the molecules of the substrate so we can correspond them to product and substructure molecules
nextLabel = 1;
labelMolecule(substrate);
// After the next section, predictedProduct is an equivalent molecule to expectedProduct, but its atom map labels
// match those of the substrate, which facilitates later computations.
try {
seedReactor.setReactants(new Molecule[] {substrate});
} catch (ReactionException e) {
LOGGER.info("Failed to setReactants. %s", e.getMessage());
throw e;
}
try {
predictedProduct = projector.reactUntilProducesProduct(seedReactor, expectedProduct);
} catch (ReactionException e) {
LOGGER.warn("Validation RO doesn't take substrate to expectedProduct: %s", e.getMessage());
throw e;
}
// convertToFrags() destroys the molecule, so this cloning is necessary. The clone method is from chemaxon and
// produces a proper, deep copy of the molecule.
Molecule substructureCopy = substructure.clone();
fragmentPointer = Arrays.asList(substructureCopy.convertToFrags()).iterator();
getNextFrag();
getNextSearchHit();
}
/**
* Gets the next possible expansion of the seed reactor according to the given substructure.
* Returns null if there are no more possible reactors.
*
* @return The Reactor.
*/
public Reactor getNextReactor() {
while (currentFrag != null) {
while (currentHit != null) {
try {
Reactor fullReactor = getExpandedReaction(currentHit);
getNextSearchHit();
if (fullReactor != null) {
return fullReactor;
}
} catch (ReactionException e) {
}
getNextSearchHit();
}
getNextFrag();
}
return null;
}
/**
* Get the next fragment of the substructure, and initialize the MolSearch searcher to look for that
* fragment in the substrate. Modifies the instance variable currentFrag to its new value if successful,
* or to null if unsuccessful.
*/
private void getNextFrag() {
if (!fragmentPointer.hasNext()) {
currentFrag = null;
return;
}
currentFrag = fragmentPointer.next();
searcher.setQuery(currentFrag);
searcher.setTarget(substrate);
}
/**
* Get the next search hit from the current searcher. Modifies the instance variable currentHit to its new value
* if successful, or to null if unsuccessful.
*/
private void getNextSearchHit() {
try {
currentHit = searcher.findNextHit();
} catch (SearchException e) {
currentHit = null;
}
}
/**
* Gets the expanded reactor corresponding to a particular search hit against a substructure fragment. Returns
* null if no valid expansion is found.
*
* @param hit The SearchHit.
* @return The Reactor.
* @throws ReactionException
*/
private Reactor getExpandedReaction(SearchHit hit) throws ReactionException {
Set<Integer> relevantAtomMaps = getRelevantSubstrateAtomMaps(substrate, hit, seedReactor);
if (relevantAtomMaps.isEmpty()) {
return null;
}
relevantAtomMaps.addAll(labelNewAtomsAndReturnAtomMaps(predictedProduct));
// Copy molecules and then destructively remove portions that don't match the RO or substructure
Molecule substrateCopy = substrate.clone();
Molecule productCopy = predictedProduct.clone();
retainRelevantAtoms(substrateCopy, relevantAtomMaps);
retainRelevantAtoms(productCopy, relevantAtomMaps);
try {
return getFullReactor(substrateCopy, productCopy);
} catch (ReactionException e) {
LOGGER.warn("Failed to getFullReactor from final substrate and expectedProduct. %s", e.getMessage());
throw e;
}
}
/**
* Label the given molecule with new atom map label values.
*
* @param mol The molecule to label.
*/
private void labelMolecule(Molecule mol) {
for (MolAtom atom : mol.getAtomArray()) {
if (atom.getAtomMap() == 0) {
atom.setAtomMap(nextLabel);
nextLabel++;
}
}
}
/**
* Get the atom maps of the substrate that are either in the substructure or affected by the seedReactor,
* if there is overlap between the two sets. Return an empty set if there is no overlap, as this indicates that the
* given search hit does not indicate any further constraint on where the RO can be applied.
*
* @param substrate The substrate of the reaction.
* @param hit The search hit of the substructure in the substrate.
* @param seedReactor The seed seedReactor from the validation corpus.
* @return A full seedReactor incorporating the substructure and seed seedReactor, if one can be constructed.
* @throws SearchException
* @throws ReactionException
*/
private Set<Integer> getRelevantSubstrateAtomMaps(Molecule substrate, SearchHit hit, Reactor seedReactor)
throws ReactionException {
Set<Integer> roAtomMaps = getSubstrateRoAtomMaps(substrate, seedReactor);
Set<Integer> sarAtomMaps = getSarAtomMaps(substrate, hit);
Set<Integer> overlap = new HashSet<>(sarAtomMaps);
overlap.retainAll(roAtomMaps);
// If the overlap is empty we don't want to build an extended RO- this indicates RO and SAR affect different
// parts of the molecule.
if (!overlap.isEmpty()) {
roAtomMaps.addAll(sarAtomMaps);
return roAtomMaps;
}
return Collections.emptySet();
}
/**
* Get the atom map values of atoms in the substrate that are involved in the reaction. This corresponds to all
* atoms which are explicitly encoded in the reactor. In practice, to extract these, we must first extract the
* atoms that correspond to product atoms included in the Reactor, and then add on "virgin" substrate atoms that
* disappear in the transformation from substrate to product.
*
* @param substrate The substrate.
* @param reactor The seedReactor.
* @return The set of atom maps that the seedReactor acts on.
*/
private Set<Integer> getSubstrateRoAtomMaps(Molecule substrate, Reactor reactor) {
Set<Integer> roAtomMaps = new HashSet<>();
Set<Integer> substrateIndicesInProduct = new HashSet<>();
/**
* The reactionMap is a map whose keys are MolAtoms in the product, and whose values are AtomIdentifiers that tell
* you about how those atoms relate to the reaction that produced the product.
*
* In this loop, we add all substrate atoms that are explicitly marked as active by the reactionMap.
*/
Map<MolAtom, AtomIdentifier> reactionMap = reactor.getReactionMap();
for (AtomIdentifier id : reactionMap.values()) { // Iterate over the AtomIdentifier values of the ReactionMap.
// If this AtomIdentifier is an orphan, this product atom does not correspond to an atom of the substrate.
if (id.isOrphanAtom()) {
continue;
}
// At this point, we know we have an AtomIdentifier that corresponds to an atom that is in both the substrate
// and the product, but we don't know if it's involved in the reaction, or just untouched by it.
// Keep tabs on the corresponding atom for later, but don't yet conclude that it's relevant.
Integer substrateAtomIndex = id.getAtomIndex(); // Index of the corresponding atom in the substrate's atom array
substrateIndicesInProduct.add(substrateAtomIndex); // Save for later.
// The reactionSchemaMap has to do with how the Reactor internally keeps track of the mapping between
// substrate and product atoms, I believe these values are only a property of the Reactor, not of these particular
// substrates and products. The key is that, to tell if a product atom is explicitly included in the Reactor, we
// can simply test if its associated reactionSchemaMap value is nonzero. If it is, then the reactor does
// explicitly encode that atom.
if (id.getReactionSchemaMap() > 0) {
roAtomMaps.add(substrate.getAtomArray()[substrateAtomIndex].getAtomMap());
}
}
/**
* Now add in all substrate atoms which occur nowhere in the product: these are also implicitly acted upon.
* In theory we should be able to find these atoms with the built-in isVirgin() method, but I didn't figure how to
* get that to work. So instead, we iterate over all substrate atoms and filter out those we already saw correspoded
* to product atoms.
*
* How I thought this could work with isVirginAtom is:
* if (atomIdentifier.isVirginAtom(new Molecule[]{predictedProduct})) { add corresponding atom map to set }
* But this throws indexOutOfBoundsExceptions within the isVirginAtom call.
*/
for (int i = 0; i < substrate.getAtomArray().length; i++) { // Iterate over indices in the substrate atom array.
if (!substrateIndicesInProduct.contains(i)) { // Test if this atom was NOT seen in the product
roAtomMaps.add(substrate.getAtomArray()[i].getAtomMap()); // If it was not, add its atom map to the result set.
}
}
return roAtomMaps;
}
/**
* Returns the atom map values corresponding to a given substructure hit.
*
* @param substrate The substrate.
* @param hit The search hit in the substrate.
* @return The substrate atom maps that match the next search hit.
* @throws SearchException
*/
private Set<Integer> getSarAtomMaps(Molecule substrate, SearchHit hit) {
Set<Integer> sarAtomMaps = new HashSet<>();
for (Integer atomId : hit.getSingleHit()) { // The values in the hit are indices into the substrate's atom array
sarAtomMaps.add(substrate.getAtomArray()[atomId].getAtomMap());
}
return sarAtomMaps;
}
/**
* Label every zero-labeled atom in the molecule with a new label, and return the newly labeled atoms' map values.
*
* @param product The molecule to label.
* @return The atom maps of the labeled atoms.
*/
private Set<Integer> labelNewAtomsAndReturnAtomMaps(Molecule product) {
Set<Integer> result = new HashSet<>();
for (MolAtom atom : product.getAtomArray()) {
if (atom.getAtomMap() == 0) { // Atoms are zero-labeled only if absent in substrate - these are important!
atom.setAtomMap(nextLabel);
result.add(nextLabel);
nextLabel++;
}
}
return result;
}
/**
* Remove all atoms that don't have atom maps in the relevantAtoms set from the molecule
*
* @param molecule The molecule to trim.
* @param relevantAtoms The atom map values to keep.
*/
private void retainRelevantAtoms(Molecule molecule, Set<Integer> relevantAtoms) {
for (MolAtom atom : molecule.getAtomArray()) {
if (!relevantAtoms.contains(atom.getAtomMap())) {
molecule.removeAtom(atom);
}
}
}
/**
* Build a seedReactor that takes the given substrate Molecules to the given Product molecule.
*
* @param finalSubstrate The substrate.
* @param finalProduct The expectedProduct.
* @return The Reactor.
* @throws ReactionException If the reactor could not be built.
*/
private Reactor getFullReactor(Molecule finalSubstrate, Molecule finalProduct) throws ReactionException {
RxnMolecule rxnMolecule = new RxnMolecule();
rxnMolecule.addComponent(finalSubstrate, RxnMolecule.REACTANTS);
rxnMolecule.addComponent(finalProduct, RxnMolecule.PRODUCTS);
Reactor fullReactor = new Reactor();
fullReactor.setReaction(rxnMolecule);
return fullReactor;
}
}