ExpandedReactionSearcher.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.sars;

import chemaxon.calculations.hydrogenize.Hydrogenize;
import chemaxon.reaction.AtomIdentifier;
import chemaxon.reaction.ReactionException;
import chemaxon.reaction.Reactor;
import chemaxon.sss.SearchConstants;
import chemaxon.sss.search.MolSearch;
import chemaxon.sss.search.MolSearchOptions;
import chemaxon.sss.search.SearchException;
import chemaxon.sss.search.SearchHit;
import chemaxon.struc.MolAtom;
import chemaxon.struc.Molecule;
import chemaxon.struc.RxnMolecule;
import com.act.biointerpretation.Utils.ReactionProjector;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

public class ExpandedReactionSearcher {

  private static final Logger LOGGER = LogManager.getFormatterLogger(ExpandedReactionSearcher.class);

  private static final MolSearchOptions LAX_SEARCH_OPTIONS = new MolSearchOptions(SearchConstants.SUBSTRUCTURE);
  private static final MolSearch DEFAULT_SEARCHER = new MolSearch();

  static {
    LAX_SEARCH_OPTIONS.setStereoSearchType(SearchConstants.STEREO_IGNORE);
    LAX_SEARCH_OPTIONS.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL4);
    DEFAULT_SEARCHER.setSearchOptions(LAX_SEARCH_OPTIONS);
  }

  private static final Hydrogenize HYDROGENIZER = new Hydrogenize();

  private final ReactionProjector projector;
  private final MolSearch searcher;

  // This value should always store the smallest value that has not thus far been used as atom map value.
  private int nextLabel;

  private Reactor seedReactor;
  private Molecule substrate;
  private Molecule expectedProduct;
  private Molecule substructure;

  private Molecule predictedProduct;
  private Iterator<Molecule> fragmentPointer;
  private Molecule currentFrag;
  private SearchHit currentHit;

  public ExpandedReactionSearcher(ReactionProjector projector, MolSearch searcher) {
    this.projector = projector;
    this.searcher = searcher;
  }

  public ExpandedReactionSearcher(ReactionProjector projector) {
    this.projector = projector;
    this.searcher = DEFAULT_SEARCHER;
  }

  /**
   * Initializes the searcher for a search by projecting the given Reactor on its substrate until it produces
   * the correct product, and resetting the fields for tracking the progress of the iterator. Must be called
   * before calling getNextReactor
   *
   * @param seed The seed Reactor.
   * @param sub The substrate.
   * @param prod The expected product.
   * @param substruct The MCS, from a set of reactions containing this one.
   * @throws ReactionException
   * @throws SearchException
   */
  public void initSearch(Reactor seed, Molecule sub, Molecule prod, Molecule substruct) throws ReactionException, SearchException {
    this.seedReactor = seed;
    this.substrate = sub;
    this.expectedProduct = prod;
    this.substructure = substruct;
    // Ensure that the resulting Reactor will include explicit hydrogens from RO
    HYDROGENIZER.convertImplicitHToExplicit(substrate);
    // Label the molecules of the substrate so we can correspond them to product and substructure molecules
    nextLabel = 1;
    labelMolecule(substrate);
    // After the next section, predictedProduct is an equivalent molecule to expectedProduct, but its atom map labels
    // match those of the substrate, which facilitates later computations.
    try {
      seedReactor.setReactants(new Molecule[] {substrate});
    } catch (ReactionException e) {
      LOGGER.info("Failed to setReactants. %s", e.getMessage());
      throw e;
    }
    try {
      predictedProduct = projector.reactUntilProducesProduct(seedReactor, expectedProduct);
    } catch (ReactionException e) {
      LOGGER.warn("Validation RO doesn't take substrate to expectedProduct: %s", e.getMessage());
      throw e;
    }
    // convertToFrags() destroys the molecule, so this cloning is necessary. The clone method is from chemaxon and
    // produces a proper, deep copy of the molecule.
    Molecule substructureCopy = substructure.clone();
    fragmentPointer = Arrays.asList(substructureCopy.convertToFrags()).iterator();
    getNextFrag();
    getNextSearchHit();
  }

  /**
   * Gets the next possible expansion of the seed reactor according to the given substructure.
   * Returns null if there are no more possible reactors.
   *
   * @return The Reactor.
   */
  public Reactor getNextReactor() {

    while (currentFrag != null) {
      while (currentHit != null) {
        try {
          Reactor fullReactor = getExpandedReaction(currentHit);
          getNextSearchHit();
          if (fullReactor != null) {
            return fullReactor;
          }
        } catch (ReactionException e) {
        }
        getNextSearchHit();
      }
      getNextFrag();
    }

    return null;
  }

  /**
   * Get the next fragment of the substructure, and initialize the MolSearch searcher to look for that
   * fragment in the substrate. Modifies the instance variable currentFrag to its new value if successful,
   * or to null if unsuccessful.
   */
  private void getNextFrag() {
    if (!fragmentPointer.hasNext()) {
      currentFrag = null;
      return;
    }
    currentFrag = fragmentPointer.next();
    searcher.setQuery(currentFrag);
    searcher.setTarget(substrate);
  }

  /**
   * Get the next search hit from the current searcher.  Modifies the instance variable currentHit to its new value
   * if successful, or to null if unsuccessful.
   */
  private void getNextSearchHit() {
    try {
      currentHit = searcher.findNextHit();
    } catch (SearchException e) {
      currentHit = null;
    }
  }

  /**
   * Gets the expanded reactor corresponding to a particular search hit against a substructure fragment. Returns
   * null if no valid expansion is found.
   *
   * @param hit The SearchHit.
   * @return The Reactor.
   * @throws ReactionException
   */
  private Reactor getExpandedReaction(SearchHit hit) throws ReactionException {
    Set<Integer> relevantAtomMaps = getRelevantSubstrateAtomMaps(substrate, hit, seedReactor);

    if (relevantAtomMaps.isEmpty()) {
      return null;
    }

    relevantAtomMaps.addAll(labelNewAtomsAndReturnAtomMaps(predictedProduct));

    // Copy molecules and then destructively remove portions that don't match the RO or substructure
    Molecule substrateCopy = substrate.clone();
    Molecule productCopy = predictedProduct.clone();
    retainRelevantAtoms(substrateCopy, relevantAtomMaps);
    retainRelevantAtoms(productCopy, relevantAtomMaps);

    try {
      return getFullReactor(substrateCopy, productCopy);
    } catch (ReactionException e) {
      LOGGER.warn("Failed to getFullReactor from final substrate and expectedProduct. %s", e.getMessage());
      throw e;
    }
  }

  /**
   * Label the given molecule with new atom map label values.
   *
   * @param mol The molecule to label.
   */
  private void labelMolecule(Molecule mol) {
    for (MolAtom atom : mol.getAtomArray()) {
      if (atom.getAtomMap() == 0) {
        atom.setAtomMap(nextLabel);
        nextLabel++;
      }
    }
  }

  /**
   * Get the atom maps of the substrate that are either in the substructure or affected by the seedReactor,
   * if there is overlap between the two sets. Return an empty set if there is no overlap, as this indicates that the
   * given search hit does not indicate any further constraint on where the RO can be applied.
   *
   * @param substrate The substrate of the reaction.
   * @param hit The search hit of the substructure in the substrate.
   * @param seedReactor The seed seedReactor from the validation corpus.
   * @return A full seedReactor incorporating the substructure and seed seedReactor, if one can be constructed.
   * @throws SearchException
   * @throws ReactionException
   */
  private Set<Integer> getRelevantSubstrateAtomMaps(Molecule substrate, SearchHit hit, Reactor seedReactor)
      throws ReactionException {
    Set<Integer> roAtomMaps = getSubstrateRoAtomMaps(substrate, seedReactor);
    Set<Integer> sarAtomMaps = getSarAtomMaps(substrate, hit);

    Set<Integer> overlap = new HashSet<>(sarAtomMaps);
    overlap.retainAll(roAtomMaps);

    // If the overlap is empty we don't want to build an extended RO- this indicates RO and SAR affect different
    // parts of the molecule.
    if (!overlap.isEmpty()) {
      roAtomMaps.addAll(sarAtomMaps);
      return roAtomMaps;
    }

    return Collections.emptySet();
  }

  /**
   * Get the atom map values of atoms in the substrate that are involved in the reaction. This corresponds to all
   * atoms which are explicitly encoded in the reactor. In practice, to extract these, we must first extract the
   * atoms that correspond to product atoms included in the Reactor, and then add on "virgin" substrate atoms that
   * disappear in the transformation from substrate to product.
   *
   * @param substrate The substrate.
   * @param reactor The seedReactor.
   * @return The set of atom maps that the seedReactor acts on.
   */
  private Set<Integer> getSubstrateRoAtomMaps(Molecule substrate, Reactor reactor) {
    Set<Integer> roAtomMaps = new HashSet<>();
    Set<Integer> substrateIndicesInProduct = new HashSet<>();

    /**
     * The reactionMap is a map whose keys are MolAtoms in the product, and whose values are AtomIdentifiers that tell
     * you about how those atoms relate to the reaction that produced the product.
     *
     * In this loop, we add all substrate atoms that are explicitly marked as active by the reactionMap.
     */
    Map<MolAtom, AtomIdentifier> reactionMap = reactor.getReactionMap();
    for (AtomIdentifier id : reactionMap.values()) { // Iterate over the AtomIdentifier values of the ReactionMap.
      // If this AtomIdentifier is an orphan, this product atom does not correspond to an atom of the substrate.
      if (id.isOrphanAtom()) {
        continue;
      }
      // At this point, we know we have an AtomIdentifier that corresponds to an atom that is in both the substrate
      // and the product, but we don't know if it's involved in the reaction, or just untouched by it.

      // Keep tabs on the corresponding atom for later, but don't yet conclude that it's relevant.
      Integer substrateAtomIndex = id.getAtomIndex(); // Index of the corresponding atom in the substrate's atom array
      substrateIndicesInProduct.add(substrateAtomIndex); // Save for later.

      // The reactionSchemaMap has to do with how the Reactor internally keeps track of the mapping between
      // substrate and product atoms, I believe these values are only a property of the Reactor, not of these particular
      // substrates and products. The key is that, to tell if a product atom is explicitly included in the Reactor, we
      // can simply test if its associated reactionSchemaMap value is nonzero. If it is, then the reactor does
      // explicitly encode that atom.
      if (id.getReactionSchemaMap() > 0) {
        roAtomMaps.add(substrate.getAtomArray()[substrateAtomIndex].getAtomMap());
      }
    }

    /**
     * Now add in all substrate atoms which occur nowhere in the product: these are also implicitly acted upon.
     * In theory we should be able to find these atoms with the built-in isVirgin() method, but I didn't figure how to
     * get that to work. So instead, we iterate over all substrate atoms and filter out those we already saw correspoded
     * to product atoms.
     *
     * How I thought this could work with isVirginAtom is:
     * if (atomIdentifier.isVirginAtom(new Molecule[]{predictedProduct})) { add corresponding atom map to set }
     * But this throws indexOutOfBoundsExceptions within the isVirginAtom call.
     */
    for (int i = 0; i < substrate.getAtomArray().length; i++) { // Iterate over indices in the substrate atom array.
      if (!substrateIndicesInProduct.contains(i)) { // Test if this atom was NOT seen in the product
        roAtomMaps.add(substrate.getAtomArray()[i].getAtomMap()); // If it was not, add its atom map to the result set.
      }
    }

    return roAtomMaps;
  }

  /**
   * Returns the atom map values corresponding to a given substructure hit.
   *
   * @param substrate The substrate.
   * @param hit The search hit in the substrate.
   * @return The substrate atom maps that match the next search hit.
   * @throws SearchException
   */
  private Set<Integer> getSarAtomMaps(Molecule substrate, SearchHit hit) {
    Set<Integer> sarAtomMaps = new HashSet<>();
    for (Integer atomId : hit.getSingleHit()) { // The values in the hit are indices into the substrate's atom array
      sarAtomMaps.add(substrate.getAtomArray()[atomId].getAtomMap());
    }
    return sarAtomMaps;
  }

  /**
   * Label every zero-labeled atom in the molecule with a new label, and return the newly labeled atoms' map values.
   *
   * @param product The molecule to label.
   * @return The atom maps of the labeled atoms.
   */
  private Set<Integer> labelNewAtomsAndReturnAtomMaps(Molecule product) {
    Set<Integer> result = new HashSet<>();
    for (MolAtom atom : product.getAtomArray()) {
      if (atom.getAtomMap() == 0) { // Atoms are zero-labeled only if absent in substrate - these are important!
        atom.setAtomMap(nextLabel);
        result.add(nextLabel);
        nextLabel++;
      }
    }
    return result;
  }

  /**
   * Remove all atoms that don't have atom maps in the relevantAtoms set from the molecule
   *
   * @param molecule The molecule to trim.
   * @param relevantAtoms The atom map values to keep.
   */
  private void retainRelevantAtoms(Molecule molecule, Set<Integer> relevantAtoms) {
    for (MolAtom atom : molecule.getAtomArray()) {
      if (!relevantAtoms.contains(atom.getAtomMap())) {
        molecule.removeAtom(atom);
      }
    }
  }

  /**
   * Build a seedReactor that takes the given substrate Molecules to the given Product molecule.
   *
   * @param finalSubstrate The substrate.
   * @param finalProduct The expectedProduct.
   * @return The Reactor.
   * @throws ReactionException If the reactor could not be built.
   */
  private Reactor getFullReactor(Molecule finalSubstrate, Molecule finalProduct) throws ReactionException {
    RxnMolecule rxnMolecule = new RxnMolecule();
    rxnMolecule.addComponent(finalSubstrate, RxnMolecule.REACTANTS);
    rxnMolecule.addComponent(finalProduct, RxnMolecule.PRODUCTS);

    Reactor fullReactor = new Reactor();
    fullReactor.setReaction(rxnMolecule);
    return fullReactor;
  }

}