/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation.mechanisminspection;
import act.server.NoSQLAPI;
import act.shared.Chemical;
import act.shared.Reaction;
import chemaxon.calculations.clean.Cleaner;
import chemaxon.formats.MolExporter;
import chemaxon.formats.MolImporter;
import chemaxon.license.LicenseManager;
import chemaxon.license.LicenseProcessingException;
import chemaxon.marvin.io.MolExportException;
import chemaxon.reaction.ReactionException;
import chemaxon.reaction.Reactor;
import chemaxon.struc.Molecule;
import chemaxon.struc.MoleculeGraph;
import com.act.biointerpretation.BiointerpretationProcessor;
import com.act.biointerpretation.Utils.ReactionProjector;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONObject;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
/**
* The mechanistic validator is used for evaluating whether a particular set of substrates and products represent a
* valid enzymatic reaction. It takes as input a DB of cofactor processed reactions and tries to match each reaction
* against a curated set of ROs. Depending on the quality of the match, it scores the RO-Reaction from a 0-5 score
* scale. The default score is always -1, in which case, the results of the mechanistic validation run is not written
* to the write DB. Else, the matched ROs will be packaged and written into the reaction in the write DB.
*/
public class MechanisticValidator extends BiointerpretationProcessor {
private static final Logger LOGGER = LogManager.getFormatterLogger(MechanisticValidator.class);
private static final String PROCESSOR_NAME = "Mechanistic Validator";
private static final String DB_PERFECT_CLASSIFICATION = "perfect";
private static final int TWO_DIMENSION = 2;
private ErosCorpus erosCorpus;
private Map<Ero, Reactor> reactors;
private BlacklistedInchisCorpus blacklistedInchisCorpus;
private ReactionProjector projector;
private int eroHitCounter = 0;
private int cacheHitCounter = 0;
private Map<Pair<Map<Long, Integer>, Map<Long, Integer>>, Pair<Long, TreeMap<Integer, List<Ero>>>> cachedEroResults =
new HashMap<>();
private enum ROScore {
PERFECT_SCORE(4),
MANUALLY_VALIDATED_SCORE(3),
MANUALLY_NOT_VERIFIED_SCORE(2),
MANUALLY_INVALIDATED_SCORE(0),
DEFAULT_MATCH_SCORE(1),
DEFAULT_UNMATCH_SCORE(-1);
private int score;
ROScore(int score) {
this.score = score;
}
public int getScore() {
return score;
}
}
// See https://docs.chemaxon.com/display/FF/InChi+and+InChiKey+export+options for MolExporter options.
public static final String MOL_EXPORTER_INCHI_OPTIONS_FOR_INCHI_COMPARISON = new StringBuilder("inchi:").
append("SNon").append(','). // Exclude stereo information.
append("AuxNone").append(','). // Don't write the AuxInfo block--it just gets in the way.
append("Woff").append(','). // Disable warnings. We'll catch any exceptions this produces, but don't care about warnings.
append("DoNotAddH"). // Don't add H according to usual valences: all H are explicit
toString();
@Override
public String getName() {
return PROCESSOR_NAME;
}
public MechanisticValidator(NoSQLAPI api) {
super(api);
}
public void init() throws IOException, ReactionException, LicenseProcessingException {
erosCorpus = new ErosCorpus();
erosCorpus.loadValidationCorpus();
blacklistedInchisCorpus = new BlacklistedInchisCorpus();
blacklistedInchisCorpus.loadCorpus();
projector = new ReactionProjector(true);
initReactors();
markInitialized();
}
private void initReactors(File licenseFile) throws IOException, LicenseProcessingException, ReactionException {
if (licenseFile != null) {
LicenseManager.setLicenseFile(licenseFile.getAbsolutePath());
}
reactors = new HashMap<>(erosCorpus.getRos().size());
for (Ero ro : erosCorpus.getRos()) {
try {
Reactor reactor = new Reactor();
reactor.setReactionString(ro.getRo());
reactors.put(ro, reactor);
} catch (java.lang.NoSuchFieldError e) {
// TODO: Investigate why so many ROs are failing at this point.
LOGGER.error("Ros is throwing a no such field error: %s", ro.getRo());
}
}
}
@Override
protected void afterProcessReactions() throws IOException, ReactionException {
super.afterProcessReactions();
LOGGER.info("Found %d reactions that matched at least one ERO", eroHitCounter);
LOGGER.info("Observed %d ERO projection cache hits based on substrates/products", cacheHitCounter);
}
@Override
protected Reaction runSpecializedReactionProcessing(Reaction rxn, Long newId) throws IOException {
return runEROsOnReaction(rxn, newId);
}
private Reaction runEROsOnReaction(Reaction rxn, Long newId) throws IOException {
projector.clearInchiCache(); // Mew reaction probably doesn't have repeat chemicals, and we don't want a huge cache
// Apply the EROs and save the results in the reaction object.
TreeMap<Integer, List<Ero>> scoreToListOfRos;
try {
/* api.writeToOutKnowledgeGraph doesn't update the id of the written reaction, so we have to pass it as a
* separate parameter. :( I would fix the MongoDB behavior, but don't know what that might break!!! */
scoreToListOfRos = findBestRosThatCorrectlyComputeTheReaction(rxn, newId);
} catch (IOException e) {
// Log some information about the culprit when validation fails.
LOGGER.error("Caught IOException when applying ROs to rxn %d): %s", newId, e.getMessage());
throw e;
}
if (scoreToListOfRos != null && scoreToListOfRos.size() > 0) {
JSONObject matchingEros = new JSONObject();
for (Map.Entry<Integer, List<Ero>> entry : scoreToListOfRos.entrySet()) {
for (Ero e : entry.getValue()) {
matchingEros.put(e.getId().toString(), entry.getKey().toString());
}
}
rxn.setMechanisticValidatorResult(matchingEros);
eroHitCounter++;
}
return rxn;
}
private TreeMap<Integer, List<Ero>> findBestRosThatCorrectlyComputeTheReaction(Reaction rxn, Long newRxnId)
throws IOException {
/* Look up any cached results and return immediately if they're available.
* Note: this only works while EROs ignore cofactors. If cofactors need to be involved, we should just remove this.
*/
Map<Long, Integer> substrateToCoefficientMap = new HashMap<>();
Map<Long, Integer> productToCoefficientMap = new HashMap<>();
for (Long id : rxn.getSubstrates()) {
substrateToCoefficientMap.put(id, rxn.getSubstrateCoefficient(id));
}
for (Long id : rxn.getProducts()) {
productToCoefficientMap.put(id, rxn.getSubstrateCoefficient(id));
}
{
Pair<Long, TreeMap<Integer, List<Ero>>> cachedResults =
cachedEroResults.get(Pair.of(substrateToCoefficientMap, productToCoefficientMap));
if (cachedResults != null) {
LOGGER.debug("Got hit on cached ERO results: %d == %d", newRxnId, cachedResults.getLeft());
cacheHitCounter++;
return cachedResults.getRight();
}
}
List<Molecule> substrateMolecules = new ArrayList<>();
for (Long id : rxn.getSubstrates()) {
String inchi = mapNewChemIdToInChI(id);
if (inchi == null) {
String msg = String.format("Missing inchi for new chem id %d in cache", id);
LOGGER.error(msg);
throw new RuntimeException(msg);
}
if (inchi.contains("FAKE")) {
LOGGER.debug("The inchi is a FAKE, so just ignore the chemical.");
continue;
}
Molecule mol;
try {
mol = MolImporter.importMol(blacklistedInchisCorpus.renameInchiIfFoundInBlacklist(inchi));
// We had to clean the molecule after importing since based on our testing, the RO only matched the molecule
// once we cleaned it. Else, the RO did not match the chemical.
Cleaner.clean(mol, TWO_DIMENSION);
// We had to aromatize the molecule so that aliphatic related ROs do not match with aromatic compounds.
mol.aromatize(MoleculeGraph.AROM_BASIC);
} catch (chemaxon.formats.MolFormatException e) {
LOGGER.error("Error occurred while trying to import inchi %s: %s", inchi, e.getMessage());
return null;
}
/* Some ROs depend on multiple copies of a given molecule (like #165), and the Reactor won't run without all of
* those molecules available. Duplicate a molecule in the substrates list based on its coefficient in the
* reaction. */
Integer coefficient = rxn.getSubstrateCoefficient(id);
if (coefficient == null) {
// Default to just one if we don't have a clear coefficient to use.
LOGGER.warn("Converting coefficient null -> 1 for rxn %d/chem %d", newRxnId, id);
coefficient = 1;
}
for (int i = 0; i < coefficient; i++) {
substrateMolecules.add(mol);
}
}
Set<String> expectedProducts = new HashSet<>();
for (Long id : rxn.getProducts()) {
String inchi = mapNewChemIdToInChI(id);
if (inchi == null) {
String msg = String.format("Missing inchi for new chem id %d in cache", id);
LOGGER.error(msg);
throw new RuntimeException(msg);
}
if (inchi.contains("FAKE")) {
LOGGER.debug("The inchi is a FAKE, so just ignore the chemical.");
continue;
}
String transformedInchi = removeChiralityFromChemical(inchi);
if (transformedInchi == null) {
return null;
}
expectedProducts.add(transformedInchi);
}
TreeMap<Integer, List<Ero>> scoreToListOfRos = new TreeMap<>(Collections.reverseOrder());
for (Map.Entry<Ero, Reactor> entry : reactors.entrySet()) {
Integer score =
scoreReactionBasedOnRO(entry.getValue(), substrateMolecules, expectedProducts, entry.getKey(), newRxnId);
if (score > ROScore.DEFAULT_UNMATCH_SCORE.getScore()) {
List<Ero> vals = scoreToListOfRos.get(score);
if (vals == null) {
vals = new ArrayList<>();
scoreToListOfRos.put(score, vals);
}
vals.add(entry.getKey());
}
}
// Cache results for any future similar reactions.
cachedEroResults.put(Pair.of(substrateToCoefficientMap, productToCoefficientMap),
Pair.of(newRxnId, scoreToListOfRos));
return scoreToListOfRos;
}
private String removeChiralityFromChemical(String inchi) throws IOException {
try {
Molecule importedMol = MolImporter.importMol(blacklistedInchisCorpus.renameInchiIfFoundInBlacklist(inchi));
Cleaner.clean(importedMol, TWO_DIMENSION);
importedMol.aromatize(MoleculeGraph.AROM_BASIC);
return MolExporter.exportToFormat(importedMol, MOL_EXPORTER_INCHI_OPTIONS_FOR_INCHI_COMPARISON);
} catch (chemaxon.formats.MolFormatException e) {
LOGGER.error("Error occured while trying to import/export molecule from inchi %s: %s", inchi, e.getMessage());
return null;
} catch (MolExportException e) {
LOGGER.error("Error occured while trying to import/export molecule from inchi %s: %s", inchi, e.getMessage());
return null;
}
}
public void initReactors() throws IOException, LicenseProcessingException, ReactionException {
initReactors(null);
}
/**
* Tests if an ero matches a reaction in the DB, and returns the appropriate validation score based on whether it
* matches and the properties of the ero.
*
* @param reactor The reactor for the Ro.
* @param substrates The substrates of the reaction.
* @param expectedProductInchis The expected inchis from the DB.
* @param ero The RO.
* @param newRxnId The new reaction ID if a match is found.
* @return
*/
public Integer scoreReactionBasedOnRO(
Reactor reactor, List<Molecule> substrates, Set<String> expectedProductInchis, Ero ero, Long newRxnId) {
Molecule[] substrateArray = substrates.toArray(new Molecule[substrates.size()]);
List<Molecule[]> productSets;
try {
productSets = projector.getAllProjectedProductSets(substrateArray, reactor, 10);
} catch (IOException e) {
LOGGER.error("Encountered IOException when projecting reactor for ERO %d onto substrates of %d: %s",
ero.getId(), newRxnId, e.getMessage());
return ROScore.DEFAULT_UNMATCH_SCORE.getScore();
} catch (ReactionException e) {
LOGGER.error("Encountered ReactionException when projecting reactor for ERO %d onto substrates of %d: %s",
ero.getId(), newRxnId, e.getMessage());
return ROScore.DEFAULT_UNMATCH_SCORE.getScore();
}
if (productSets.isEmpty()) {
LOGGER.debug("No products were generated from the projection");
return ROScore.DEFAULT_UNMATCH_SCORE.getScore();
}
for (Molecule[] products : productSets) {
// If one of the product sets completely matches the expected product inchis set, we are confident that
// the reaction can be explained by the RO.
for (String productInchi : getInchiSet(products)) {
if (expectedProductInchis.contains(productInchi)) {
return getMatchScore(ero);
}
}
}
return ROScore.DEFAULT_UNMATCH_SCORE.getScore();
}
/**
* Returns the validation score that an ero will produce if it matches a reaction in the DB.
*
* @param ero The ero.
* @return The score.
*/
private Integer getMatchScore(Ero ero) {
if (ero.getCategory().equals(DB_PERFECT_CLASSIFICATION)) {
return ROScore.PERFECT_SCORE.getScore();
}
if (ero.getManual_validation()) {
return ROScore.MANUALLY_VALIDATED_SCORE.getScore();
}
if (ero.getManual_validation() == null) {
return ROScore.MANUALLY_NOT_VERIFIED_SCORE.getScore();
}
if (!ero.getManual_validation()) {
return ROScore.MANUALLY_INVALIDATED_SCORE.getScore();
}
return ROScore.DEFAULT_MATCH_SCORE.getScore();
}
/**
* Turns an array of molecules, i.e. a product array produced by a Reactor, into a Set of inchis
*
* @param molsArray The array of molecules.
* @return The corresponding set of inchis.
*/
private Set<String> getInchiSet(Molecule[] molsArray) {
Set<String> result = new HashSet<>();
for (Molecule product : molsArray) {
String inchi;
try {
inchi = MolExporter.exportToFormat(product, MOL_EXPORTER_INCHI_OPTIONS_FOR_INCHI_COMPARISON);
} catch (IOException e) {
LOGGER.error("Unable to export molecule to InChI, skipping: %s", e.getMessage());
continue;
}
result.add(inchi);
}
return result;
}
/**
* Validate a single reaction by its ID.
*
* Important: do not call this on an object that has been/will be used to validate an entire DB (via the `run` method,
* for example). The two approaches to validation use the same cache objects which will be corrupted if the object
* is reused (hence this method being protected).
*
* @param rxnId The id of the reaction to validate.
* @return Scored ERO projection results or null if an error occurred.
* @throws IOException
*/
public Map<Integer, List<Ero>> validateOneReaction(Long rxnId) throws IOException {
Reaction rxn = getNoSQLAPI().readReactionFromInKnowledgeGraph(rxnId);
if (rxn == null) {
LOGGER.error("Could not find reaction %d in the DB", rxnId);
return null;
}
Set<Long> allChemicalIds = new HashSet<>();
allChemicalIds.addAll(Arrays.asList(rxn.getSubstrates()));
allChemicalIds.addAll(Arrays.asList(rxn.getProducts()));
allChemicalIds.addAll(Arrays.asList(rxn.getSubstrateCofactors()));
allChemicalIds.addAll(Arrays.asList(rxn.getProductCofactors()));
for (Long id : allChemicalIds) {
Chemical chem = getNoSQLAPI().readChemicalFromInKnowledgeGraph(id);
if (chem == null) {
LOGGER.error("Unable to find chemical %d for reaction %d in the DB", id, rxnId);
return null;
}
// Simulate chemical migration so we play nicely with the validator.
getOldChemIdToNewChemId().put(id, id);
getNewChemIdToInchi().put(id, chem.getInChI());
}
return findBestRosThatCorrectlyComputeTheReaction(rxn, rxnId);
}
}