/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation.desalting;
import chemaxon.calculations.hydrogenize.Hydrogenize;
import chemaxon.formats.MolExporter;
import chemaxon.formats.MolImporter;
import chemaxon.license.LicenseManager;
import chemaxon.license.LicenseProcessingException;
import chemaxon.reaction.ReactionException;
import chemaxon.reaction.Reactor;
import chemaxon.struc.Molecule;
import chemaxon.struc.PeriodicSystem;
import com.act.biointerpretation.Utils.ReactionProjector;
import org.apache.commons.lang.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* Desalter tries to remove any ionization or secondary ions from an inchi.
* To use, create an instance of Desalter then use the clean method
* to convert one inchi to a desalted version. One Desalter can be reused.
*
* Desalting is the process of standardizing ionized variants of a Molecule.
* It also splits multi-component reactions into multiple entities.
* Desalter currently uses Indigo for RO Projection, and this needs to
* be replaced with ChemAxon.
*
* Desalter does all the business logic of inputting an inchi and outputting one
* or more desalted versions of it (the "clean" method). Though it does a little
* more than apply ROs, the most essential piece of the code is the ROs, which are
* stored in a file called com/act/biointerpretation/desalting/desalting_ros.json.
*
* That RO file also includes tests. Running Desalter.main() directly will execute
* these tests. They should all pass except one case in the title called secondary_ammoniums.
* TODO: We have parked this test case and will get back to it once later during development.
* {
* "input": "InChI=1S/C12H11N.C7H8O3S/c1-3-7-11(8-4-1)13-12-9-5-2-6-10-12;1-6-2-4-7(5-3-6)11(8,9)10/h1-10,13H;2-5H,1H3,(H,8,9,10)",
* "expected": "InChI=1S/C12H11N/c1-3-7-11(8-4-1)13-12-9-5-2-6-10-12/h1-10,13H",
* "label": "N-Phenylanilinium tosylate"
* }
*
* There is a second file (com/act/biointerpretation/desalting/desalter_constants.txt)
* that are additional tests which are also executed by this class.
*
* The main method also pulls 10000 entries from the database and bin each one
* based on the result: (caused an error, got modified, didn't get modified, was
* split into multiple inchis). I've gone through these lists somewhat and for the
* most part found no errors. There are some edge cases (specifically
* porphyrins and some rare ions like C[N-]) that are not handled
* currently. I have also performed this analysis on 10000 entries that
* are not necessarily in Reactions, and those looked fine too. After
* running ReactionDesalter on Dr. Know and creating synapse, I examined
* 1000 reaction entries from synapse. I looked at all the instances of
* "+" in the SMILES and found no errors. I also inspected the first 200
* in detail to confirm that no chemicals were lost relative to the text
* description.
*
* TODO: Edge cases that remain to be handled are: radioactive. heme
* See Desalter_modified_alldb_checked for examples of errors that remain
*
* TODO: Add as positive tests the 'ok' things in Desalter_modified_alldb_checked
*
* TODO: filter InChIs by string components before actually desalting for better efficiency.
*
* Note: to use the Chemaxon desalter, you'll need to have a Chemaxon license file installed in your home directory.
* To do this, run (after connecting to the NAS):
* $ MNT_SHARED_DATA/3rdPartySoftware/Chemaxon/marvinbeans/bin/license [path to a valid license file]
* This will copy the license to ~/.chemaxon/license.cxl, which the Chemaxon libraries will find automatically when
* the license manager is invoked.
*/
public class Desalter {
private static final DesaltingROCorpus DESALTING_CORPUS_ROS = new DesaltingROCorpus();
private static final Integer MAX_NUMBER_OF_ROS_TRANSFORMATION_ITERATIONS = 1000;
private static final Logger LOGGER = LogManager.getLogger(Desalter.class);
private static final String INFINITE_LOOP_DETECTED_EXCEPTION_STRING = "The algorithm has encountered a loop for this " +
"set of transformations %s on this transformed inchi: %s";
private static final Integer PRODUCT_TO_CHOOSE_INDEX = 0;
public static class InfiniteLoopDetectedException extends Exception {
public InfiniteLoopDetectedException(String message) {
super(message);
}
}
private ReactionProjector projector;
public Desalter(ReactionProjector projector) {
this.projector = projector;
}
/**
* This function desalts a given inchi representation of a molecule by first preprocessing the molecule by taking
* out extra representations like free radicals, only processing organics or a subset of an inorganic molecule
* and then desalting those component only.
*
* @param inchi The inchi representation of the chemical
* @return A set of desalted compounds within the input chemical
* @throws Exception
*/
public Map<String, Integer> desaltInchi(String inchi)
throws InfiniteLoopDetectedException, IOException, ReactionException {
// Resolve the smiles to only those that are 2-carbon units.
// Do not store the output of MolImporter, as the object will be destroyed during fragmentation.
List<Molecule> resolved = resolveMixtureOfAtomicComponents(MolImporter.importMol(inchi));
// Clean each compound
final List<Molecule> desaltedAndDeionized = new ArrayList<>(resolved.size());
for (Molecule organicOrBiggestInorganicMass : resolved) {
// Make hydrogens explicit
Molecule desaltedChemicalFragment = applyROsToMolecule(organicOrBiggestInorganicMass, inchi, true);
desaltedAndDeionized.add(desaltedChemicalFragment);
}
// Don't combine fragments in order to match Indigo behavior, but preserve the count of each desalted fragment.
return mols2InchiCounts(desaltedAndDeionized);
}
/**
* Splits molecule into fragments, desalts each fragments, returns results in a list.
* This is intended to be used for the abstract reaction -> SAR pipeline. Here, it is essential that we don't muck
* with the implicitness/explicitness of the hydrogens. That is why we work only at the molecule level, and that is
* the only difference in the logic between this and desalteInchi().
* @param molecule
* @return
* @throws IOException
*/
public List<Molecule> desaltMoleculeForAbstractReaction(Molecule molecule)
throws IOException, InfiniteLoopDetectedException, ReactionException {
// Resolve the smiles to only those that are 2-carbon units.
// Do not store the output of MolImporter, as the object will be destroyed during fragmentation.
List<Molecule> resolved = resolveMixtureOfAtomicComponents(molecule);
// Clean each compound
final List<Molecule> desaltedAndDeionized = new ArrayList<>(resolved.size());
for (Molecule organicOrBiggestInorganicMass : resolved) {
// do not make hydrogens explicit
Molecule desaltedChemicalFragment = applyROsToMolecule(organicOrBiggestInorganicMass, "NO_NAMES_YET", false);
desaltedAndDeionized.add(desaltedChemicalFragment);
}
// Don't combine fragments in order to match Indigo behavior, but preserve the count of each desalted fragment.
return desaltedAndDeionized;
}
// See https://docs.chemaxon.com/display/FF/InChi+and+InChiKey+export+options for MolExporter options.
// See also https://www.chemaxon.com/forum/ftopic10424.html for why we want to use SAbs.
public static final String MOL_EXPORTER_INCHI_OPTIONS = new StringBuilder("inchi:").
append("SAbs").append(','). // Force absolute stereo to ensure standard InChIs are produced.
append("AuxNone").append(','). // Don't write the AuxInfo block--it just gets in the way.
append("Woff"). // Disable warnings. We'll catch any exceptions this produces, but don't care about warnings.
toString();
public static String mol2Inchi(Molecule mol) throws IOException {
return MolExporter.exportToFormat(mol, MOL_EXPORTER_INCHI_OPTIONS);
}
public static List<String> mols2Inchis(List<Molecule> mols) throws IOException {
List<String> inchis = new ArrayList<>(mols.size());
for (Molecule mol : mols) {
inchis.add(mol2Inchi(mol));
}
return inchis;
}
public static Map<String, Integer> mols2InchiCounts(List<Molecule> mols) throws IOException {
Map<String, Integer> inchiCounts = new LinkedHashMap<>(mols.size()); // Preserve order as well as count.
for (Molecule mol : mols) {
String inchi = mol2Inchi(mol);
Integer count = inchiCounts.get(inchi);
inchiCounts.put(inchi, (count == null ? 0 : count) + 1);
}
return inchiCounts;
}
/**
* This function desalts an input inchi chemical by running it through a list of curated desalting ROs in a loop
* and transforms the inchi till it reaches a stable state.
* @param baseMolecule The molecule on which to project desalting ROs.
* @param inchi The inchi for the base molecule, used for logging.
* @return The desalted inchi chemical
* @throws Exception
*/
protected Molecule applyROsToMolecule(Molecule baseMolecule, String inchi, boolean makeHydrogensExplicit)
throws IOException, InfiniteLoopDetectedException, ReactionException {
projector.clearInchiCache(); // Clear cache on each new base molecule.
Molecule transformedMolecule = baseMolecule;
/* Add explicit hydrogens before projecting to ensure that the hydrogens in the ROs have something to match against.
* Note that this didn't seem to actually have much (if any) effect on the InChIs used to validate the desalter
* (the Reactor seems to work out implicit hydrogens itself), but it shouldn't cause any harm either. */
if (makeHydrogensExplicit) {
Hydrogenize.convertImplicitHToExplicit(baseMolecule);
}
// Then try all the ROs
Set<Molecule> bagOfTransformedMolecules = new LinkedHashSet<>();
int counter = 0;
while (counter < MAX_NUMBER_OF_ROS_TRANSFORMATION_ITERATIONS) {
boolean foundEffectiveReaction = false;
for (DesaltingRO ro : corpus.getRos()) {
Molecule product = getFirstPredictedProduct(baseMolecule, ro);
// If there are no products from this transformation, skip to the next RO.
if (product == null) {
continue;
}
// Break out as soon as we find an RO that imparts some sort of change.
if (!product.equals(transformedMolecule)) {
transformedMolecule = product;
foundEffectiveReaction = true;
break;
}
}
// Assume that if all of the transformations returned an equivalent molecule or null then we're done running ROs.
if (transformedMolecule == null) {
transformedMolecule = baseMolecule;
}
// If the transformed inchi is the same as the input inchi, we have reached a stable state in the chemical
// transformation process, therefore break out of the loop.
if (!foundEffectiveReaction) {
break;
}
// If we see a similar transformed inchi as an earlier transformation, we know that we have enter a cyclical
// loop that will go on to possibly infinity. Hence, we throw when such a situation happens.
if (bagOfTransformedMolecules.contains(transformedMolecule)) {
String generatedChemicalTransformations =
StringUtils.join(mols2Inchis(new ArrayList<>(bagOfTransformedMolecules)), " -> ");
String transformedInchi = mol2Inchi(transformedMolecule);
String msg =
String.format(INFINITE_LOOP_DETECTED_EXCEPTION_STRING, generatedChemicalTransformations, transformedInchi);
LOGGER.error(msg);
throw new InfiniteLoopDetectedException(msg);
} else {
if (transformedMolecule != null) {
bagOfTransformedMolecules.add(transformedMolecule);
}
}
baseMolecule = transformedMolecule;
counter++;
}
LOGGER.debug("%d transformations for %s", counter, inchi);
return transformedMolecule;
}
private Map<DesaltingRO, Reactor> reactors;
private DesaltingROCorpus corpus;
public void initReactors(File licenseFile) throws IOException, LicenseProcessingException, ReactionException {
if (licenseFile != null) {
LicenseManager.setLicenseFile(licenseFile.getAbsolutePath());
}
this.corpus = DESALTING_CORPUS_ROS.getDesaltingROS();
List<DesaltingRO> ros = corpus.getRos();
reactors = new HashMap<>(ros.size());
for (DesaltingRO ro : ros) {
Reactor reactor = new Reactor();
reactor.setReactionString(ro.getReaction());
reactors.put(ro, reactor);
}
}
public void initReactors() throws IOException, LicenseProcessingException, ReactionException {
initReactors(null);
}
private static List<Molecule> resolveMixtureOfAtomicComponents(Molecule molecule) {
List<Molecule> fragments = Arrays.asList(molecule.convertToFrags());
// Just return the whole molecule if there's only one fragment.
if (fragments.size() == 1) {
return fragments;
}
List<Molecule> resolvedFragments = new ArrayList<>(fragments.size());
// Search for any carbon-containing fragments of the molecule.
for (Molecule fragment : fragments) {
if (fragment.getAtomCount(PeriodicSystem.C) > 0) {
resolvedFragments.add(fragment);
}
}
// If we have at least one organic component, return that.
if (resolvedFragments.size() > 0) {
return resolvedFragments;
}
// Return the largest component of inorganic molecules.
Molecule fragmentWithHighestMass = null;
double highestMass = 0.0;
for (Molecule fragment : fragments) {
double mass = fragment.getExactMass();
if (fragmentWithHighestMass == null || mass > highestMass) {
fragmentWithHighestMass = fragment;
highestMass = mass;
}
}
return Collections.singletonList(fragmentWithHighestMass);
}
/**
* This function takes as input a Molecule and a Reactor and outputs the first possible product of the transformation.
*
* @param mol A Molecule representing the chemical reactant.
* @param ro A Reactor representing the reaction to apply.
* @return The product of the reaction, or null if no product is produced.
* @throws ReactionException
*/
private Molecule getFirstPredictedProduct(Molecule mol, DesaltingRO ro) throws ReactionException, IOException {
Reactor reactor = reactors.get(ro);
List<Molecule[]> productSets = projector.getAllProjectedProductSets(new Molecule[]{mol}, reactor);
if (productSets.isEmpty()) {
return null;
}
Molecule[] firstProducts = productSets.get(PRODUCT_TO_CHOOSE_INDEX);
if (firstProducts.length != 1) {
LOGGER.error("Reactor returned invalid number of products (%d), returning null.", productSets.size());
return null;
}
return firstProducts[0];
}
/**
* This function converts an input inchi to a smile representation of the chemical
*
* @param inchi The inchi representation of the chemical
* @return The smile representation of the chemical
*/
public String InchiToSmiles(String inchi) {
try {
return MolExporter.exportToFormat(MolImporter.importMol(inchi), "smiles");
} catch (Exception err) {
LOGGER.error(String.format("Error converting inchi %s to SMILES: %s\n", inchi, err.getMessage()));
return null;
}
}
}