/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.analysis.surfactant;
import chemaxon.calculations.clean.Cleaner;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.marvin.calculations.HlbPlugin;
import chemaxon.marvin.calculations.LogPMethod;
import chemaxon.marvin.calculations.MajorMicrospeciesPlugin;
import chemaxon.marvin.calculations.logPPlugin;
import chemaxon.marvin.calculations.pKaPlugin;
import chemaxon.marvin.plugin.PluginException;
import chemaxon.marvin.space.MSpaceEasy;
import chemaxon.marvin.space.MolecularSurfaceComponent;
import chemaxon.marvin.space.MoleculeComponent;
import chemaxon.marvin.space.SurfaceColoring;
import chemaxon.marvin.space.SurfaceComponent;
import chemaxon.struc.DPoint3;
import chemaxon.struc.MolAtom;
import chemaxon.struc.MolBond;
import chemaxon.struc.Molecule;
import com.chemaxon.calculations.solubility.SolubilityCalculator;
import com.chemaxon.calculations.solubility.SolubilityResult;
import com.chemaxon.calculations.solubility.SolubilityUnit;
import com.dreizak.miniball.highdim.Miniball;
import com.dreizak.miniball.model.ArrayPointSet;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.stat.regression.RegressionResults;
import org.apache.commons.math3.stat.regression.SimpleRegression;
import javax.swing.JFrame;
import javax.swing.WindowConstants;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class SurfactantAnalysis {
String inchi;
logPPlugin plugin = new logPPlugin();
MajorMicrospeciesPlugin microspeciesPlugin = new MajorMicrospeciesPlugin();
Molecule mol;
// MolAtom objects don't seem to record their index in the parent molecule, so we'll build a mapping here.
Map<MolAtom, Integer> atomToIndexMap = new HashMap<>();
// Atom indices for the longest vector between any two atoms in the molecule.
Integer lvIndex1;
Integer lvIndex2;
// Coordinates with lvIndex1 treated as the origin.
List<DPoint3> normalizedCoordinates;
Map<Integer, Double> distancesFromLongestVector = new HashMap<>();
Map<Integer, Double> distancesAlongLongestVector = new HashMap<>();
Map<Integer, Plane> normalPlanes = new HashMap<>();
// Atoms with max/min logP values.
Integer maxLogPIndex;
Integer minLogPIndex;
public enum FEATURES {
// Whole-molecule features
LOGP_TRUE,
//TODO:
//LOGD_7_4,
//LOGD_2_5,
//LOGD_RATIO,
// Plane split features
PS_LEFT_MEAN_LOGP,
PS_RIGHT_MEAN_LOGP,
PS_LR_SIZE_DIFF_RATIO,
PS_LR_POS_NEG_RATIO_1, // Left neg / right pos
PS_LR_POS_NEG_RATIO_2, // Right neg / left pos
PS_ABS_LOGP_DIFF,
PS_ABS_LOGP_SIGNS_DIFFER,
PS_WEIGHTED_LOGP_DIFF,
PS_WEIGHTED_LOGP_SIGNS_DIFFER,
PS_MAX_ABS_DIFF, // This should be equivalent to the old split metric from the DARPA report (I hope).
PS_LEFT_POS_NEG_RATIO,
PS_RIGHT_POS_NEG_RATIO,
// Regression features
REG_WEIGHTED_SLOPE,
REG_WEIGHTED_INTERCEPT,
REG_VAL_AT_FARTHEST_POINT,
REG_CROSSES_X_AXIS,
REG_ABS_SLOPE,
// Geometric features,
GEO_LV_FD_RATIO,
// Extreme neighborhood features
NBH_MAX_AND_MIN_TOGETHER,
NBH_MAX_IN_V1,
NBH_MAX_IN_V2,
NBH_MIN_IN_V1,
NBH_MIN_IN_V2,
NBH_MAX_N_MEAN,
NBH_MIN_N_MEAN,
NBH_MAX_POS_RATIO,
NBH_MIN_NEG_RATIO,
// Solubility features
SOL_MG_ML_25,
SOL_MG_ML_30,
SOL_MG_ML_35,
// pKa features
PKA_ACID_1, PKA_ACID_1_IDX,
PKA_ACID_2, PKA_ACID_2_IDX,
PKA_ACID_3, PKA_ACID_3_IDX,
PKA_BASE_1, PKA_BASE_1_IDX,
PKA_BASE_2, PKA_BASE_2_IDX,
PKA_BASE_3, PKA_BASE_3_IDX,
// HBL features
HLB_VAL,
}
public SurfactantAnalysis() { }
/**
* Imports a molecule and runs essential calculations (like logP).
* @param inchi The InChI of a molecule to be imported.
* @throws MolFormatException
* @throws PluginException
* @throws IOException
*/
public void init(String inchi)
throws MolFormatException, PluginException, IOException {
this.inchi = inchi;
Molecule importMol = MolImporter.importMol(this.inchi);
Cleaner.clean(importMol, 3); // This will assign 3D atom coordinates to the MolAtoms in this.mol.
plugin.standardize(importMol);
// Note: this doesn't seem to have any effect, but we'll try anyway for our current use case.
microspeciesPlugin.setpH(1.5);
microspeciesPlugin.setMolecule(importMol);
microspeciesPlugin.run();
Molecule phMol = microspeciesPlugin.getMajorMicrospecies();
plugin.setlogPMethod(LogPMethod.CONSENSUS);
// TODO: do we need to explicitly specify ion concentration?
plugin.setUserTypes("logPTrue,logPMicro,logPNonionic"); // These arguments were chosen via experimentation.
plugin.setMolecule(phMol);
plugin.run();
this.mol = plugin.getResultMolecule();
// The logP values exposed by the plugin are only accessible by index; make an object -> id map for easier lookup.
MolAtom[] molAtoms = mol.getAtomArray();
for (int i = 0; i < molAtoms.length; i++) {
atomToIndexMap.put(molAtoms[i], i);
}
}
/**
* Finds the pair of most distant atoms that contribute to the molecule's logP value.
* @return A pair of atom indices for the two most distant atoms in the molecule.
*/
public Pair<Integer, Integer> findFarthestContributingAtomPair() {
Double maxDist = 0.0d;
Integer di1 = null, di2 = null; // Endpoint atoms of the diameter of the structure.
for (int i = 0; i < mol.getAtomCount(); i++) {
if (Double.isNaN(plugin.getAtomlogPIncrement(i))) {
continue;
}
for (int j = 0; j < mol.getAtomCount(); j++) {
if (i == j) {
continue;
}
if (Double.isNaN(plugin.getAtomlogPIncrement(j))) {
continue;
}
MolAtom m1 = mol.getAtom(i);
MolAtom m2 = mol.getAtom(j);
DPoint3 c1 = m1.getLocation();
DPoint3 c2 = m2.getLocation();
Double dist = c1.distance(c2);
if (dist > maxDist) {
maxDist = dist;
di1 = i;
di2 = j;
}
}
}
this.lvIndex1 = di1;
this.lvIndex2 = di2;
this.normalizedCoordinates = resetOriginForCoordinates(di1);
return Pair.of(di1, di2);
}
/**
* Compute the distance between two atoms in the molecule being analyzed.
* @param a1 The index of one atom.
* @param a2 The index of the other atom.
* @return A distance (units not specified) between the two atoms in the molecule's coordinate space.
*/
public Double computeDistance(Integer a1, Integer a2) {
return this.normalizedCoordinates.get(a1).distance(this.normalizedCoordinates.get(a2));
}
/**
* Recenters all atomic coordinates around a new origin.
* @param newOriginIndex The atom index to use as a new origin.
* @return A list of coordinates for all atoms using the specified atom as the origin.
*/
public List<DPoint3> resetOriginForCoordinates(Integer newOriginIndex) {
DPoint3 newOrigin = mol.getAtom(newOriginIndex).getLocation();
List<DPoint3> coords = new ArrayList<>();
for (int i = 0; i < mol.getAtomCount(); i++) {
DPoint3 c = mol.getAtom(i).getLocation();
c.subtract(newOrigin);
coords.add(c);
}
return coords;
}
public static class Plane {
public double a;
public double b;
public double c;
public double d;
public Plane(double a, double b, double c, double d) {
this.a = a;
this.b = b;
this.c = c;
this.d = d;
}
public double computeProductForPoint(double x, double y, double z) {
return a * x + b * y + c * z + d;
}
}
/**
* Computes an atom's projection onto `lv` and the `lv`-normal plane that intersects that projection, where `lv` is
* the vector between the pair of most distant atoms in the molecule.
*
* @return Maps of atomic indices to distances from `lv` and to an `lv`-normal plane that intersects that molecule.
*/
public Pair<Map<Integer, Double>, Map<Integer, Plane>> computeAtomDistanceToLongestVectorAndNormalPlanes() {
List<DPoint3> coords = this.normalizedCoordinates;
for (int i = 0; i < mol.getAtomCount(); i++) {
if (i == lvIndex1 || i == lvIndex2) {
continue;
}
DPoint3 diameter = coords.get(lvIndex2);
DPoint3 exp = coords.get(i);
Double dotProduct = diameter.x * exp.x + diameter.y * exp.y + diameter.z * exp.z;
Double lengthProduct = Math.sqrt(diameter.lengthSquare()) * Math.sqrt(exp.lengthSquare());
Double cosine = dotProduct / lengthProduct;
Double sine = Math.sqrt(1 - cosine * cosine);
Double vLength = Math.sqrt(exp.lengthSquare());
Double perpendicularDist = sine * vLength;
Double proj = cosine * vLength;
distancesFromLongestVector.put(i, perpendicularDist);
distancesAlongLongestVector.put(i, proj);
normalPlanes.put(i, new Plane(diameter.x, diameter.y, diameter.z, -1d * dotProduct));
}
distancesFromLongestVector.put(lvIndex1, 0.0);
distancesFromLongestVector.put(lvIndex2, 0.0);
distancesAlongLongestVector.put(lvIndex1, 0.0);
distancesAlongLongestVector.put(lvIndex2, Math.sqrt(coords.get(lvIndex2).lengthSquare()));
return Pair.of(distancesFromLongestVector, normalPlanes);
}
/**
* Computes sets of atoms on either side of each `lv`-normal plane defined by each atom.
* @return A map of atom index to lists of atoms on each side of the atom-incident `lv`-normal plane.
*/
public Map<Integer, Pair<List<Integer>, List<Integer>>> splitAtomsByNormalPlanes() {
List<DPoint3> coords = resetOriginForCoordinates(lvIndex1);
Map<Integer, Pair<List<Integer>, List<Integer>>> results = new HashMap<>();
for (int i = 0; i < mol.getAtomCount(); i++) {
Plane p = normalPlanes.get(i);
if (p == null) {
continue;
}
List<Integer> negSide = new ArrayList<>();
List<Integer> posSide = new ArrayList<>();
for (int j = 0; j < mol.getAtomCount(); j++) {
if (i == j) {
continue;
}
DPoint3 c = coords.get(j);
double prod = p.computeProductForPoint(c.x, c.y, c.z);
// It seems unlikely that an atom would be coplanar to the dividing atom, but who knows. Throw it in pos if so.
if (prod < 0.0000d) {
negSide.add(j);
} else {
posSide.add(j);
}
}
results.put(i, Pair.of(negSide, posSide));
}
return results;
}
/**
* Computes the minimum bounding ball around a list of coordinates.
* @param coords A list of coordinates whose minimum bounding ball to compute.
* @return A center and radius of the minimum bounding ball for the specified list of points.
*/
public Pair<DPoint3, Double> computeMinimumBoundingBall(List<DPoint3> coords) {
ArrayPointSet aps = new ArrayPointSet(3, coords.size());
for (int i = 0; i < coords.size(); i++) {
DPoint3 c = coords.get(i);
aps.set(i, 0, c.x);
aps.set(i, 1, c.y);
aps.set(i, 2, c.z);
}
Miniball mb = new Miniball(aps);
double[] c = mb.center();
DPoint3 center = new DPoint3(c[0], c[1], c[2]);
return Pair.of(center, mb.radius());
}
/**
* Contribute the minimum bounding ball for all atoms that contribute the the molecule's logP value.
* @return A center and raidus for the minimum bounding ball around logP-contributing atoms.
*/
public Pair<DPoint3, Double> computeMinimumBoundingBallForContributingAtoms() {
MolAtom[] atoms = mol.getAtomArray();
List<DPoint3> coords = new ArrayList<>(atoms.length);
for (int i = 0; i < atoms.length; i++) {
// Ignore atoms that don't contribute to the logP value (i.e. have a NaN LogP value).
if (Double.isNaN(plugin.getAtomlogPIncrement(i))) {
continue;
}
coords.add(atoms[i].getLocation());
}
return computeMinimumBoundingBall(coords);
}
/**
* Explore the neighborhood within `depths` steps of the atom with the specified atomic index, returning a map of
* neighboring atomic indices to their step-wise distance from the specified origin atom.
*
* @param index The index of the atom whose neighborhood to explore.
* @param depth The maximum number of steps to take away from the origin atom.
* @return A map of atomic index to step-wise distance from the specified origin atom.
*/
public Map<Integer, Integer> exploreNeighborhood(int index, int depth) {
return exploreNeighborhoodHelper(index, depth, depth, new HashMap<>());
}
// Recursively walk the atom's neighborhood.
private Map<Integer, Integer> exploreNeighborhoodHelper(int index, int baseDepth, int depth,
Map<Integer, Integer> atomsAndDepths) {
if (!atomsAndDepths.containsKey(index)) {
atomsAndDepths.put(index, baseDepth - depth);
}
if (depth <= 0) {
return atomsAndDepths;
}
MolAtom d1 = mol.getAtom(index);
MolBond[] d1bonds = d1.getBondArray();
for (MolBond b : d1bonds) {
MolAtom dest;
if (b.getAtom1().equals(d1)) {
dest = b.getAtom2();
} else {
dest = b.getAtom1();
}
int desti = atomToIndexMap.get(dest);
if (!atomsAndDepths.containsKey(desti)) {
atomsAndDepths = exploreNeighborhoodHelper(desti,baseDepth, depth - 1, atomsAndDepths);
}
}
return atomsAndDepths;
}
public static final Double MIN_AND_MAX_LOG_P_LONGEST_VECTOR_BOOST = 0.00001;
/**
* Walk bonds from the lv endpoints and min/max logP atoms, computing stats about their makeup.
*
* @return A map of features to numeric values for extreme-neighborhood type attributes (NBH_*).
*/
public Map<FEATURES, Double> exploreExtremeNeighborhoods() {
Integer vMax = null, vMin = null;
double lpMax = 0.0, lpMin = 0.0;
for (int i = 0; i < mol.getAtomCount(); i++) {
double lp = plugin.getAtomlogPIncrement(i);
if (i == lvIndex1 || i == lvIndex2) {
// Boost the most distant points by a little bit to break ties.
lp = lp > 0.0 ? lp + MIN_AND_MAX_LOG_P_LONGEST_VECTOR_BOOST : lp - MIN_AND_MAX_LOG_P_LONGEST_VECTOR_BOOST;
}
if (vMax == null || lp > lpMax) {
vMax = i;
lpMax = lp;
}
if (vMin == null || lp < lpMin) {
vMin = i;
lpMin = lp;
}
}
maxLogPIndex = vMax;
minLogPIndex = vMin;
Map<Integer, Integer> maxNeighborhood = exploreNeighborhood(vMax, 2);
Map<Integer, Integer> minNeighborhood = exploreNeighborhood(vMin, 2);
Map<Integer, Integer> v1Neighborhood = exploreNeighborhood(lvIndex1, 2);
Map<Integer, Integer> v2Neighborhood = exploreNeighborhood(lvIndex2, 2);
boolean maxAndMinInSimilarNeighborhood = maxNeighborhood.containsKey(vMin);
boolean maxInV1N = v1Neighborhood.containsKey(vMax);
boolean maxInV2N = v2Neighborhood.containsKey(vMax);
boolean minInV1N = v1Neighborhood.containsKey(vMin);
boolean minInV2N = v2Neighborhood.containsKey(vMin);
// These odd *_ accumulators are because the vars used in the put() calls for the return value need to be final.
double maxNSum_ = 0.0;
int maxNWithPosSign_ = 0;
for (Integer i : maxNeighborhood.keySet()) {
double logp = plugin.getAtomlogPIncrement(i);
maxNSum_ += logp;
if (logp >= 0.0) {
maxNWithPosSign_++;
}
}
double maxNSum = maxNSum_;
double maxNWithPosSign = Integer.valueOf(maxNWithPosSign_).doubleValue();
double minNSum_ = 0.0;
int minNWithNegSign_ = 0;
for (Integer i : minNeighborhood.keySet()) {
double logp = plugin.getAtomlogPIncrement(i);
minNSum_ += logp;
if (logp <= 0.0) {
minNWithNegSign_++;
}
}
double minNSum = minNSum_;
double minNWithNegSign = Integer.valueOf(minNWithNegSign_).doubleValue();
return new HashMap<FEATURES, Double>() {{
put(FEATURES.NBH_MAX_AND_MIN_TOGETHER, maxAndMinInSimilarNeighborhood ? 1.0 : 0);
put(FEATURES.NBH_MAX_IN_V1, maxInV1N ? 1.0 : 0); // Boolean -> float makes this friendly to downstream analysis.
put(FEATURES.NBH_MAX_IN_V2, maxInV2N ? 1.0 : 0);
put(FEATURES.NBH_MIN_IN_V1, minInV1N ? 1.0 : 0);
put(FEATURES.NBH_MIN_IN_V2, minInV2N ? 1.0 : 0);
put(FEATURES.NBH_MAX_N_MEAN, maxNSum / Integer.valueOf(maxNeighborhood.size()).doubleValue());
put(FEATURES.NBH_MIN_N_MEAN, minNSum / Integer.valueOf(maxNeighborhood.size()).doubleValue());
put(FEATURES.NBH_MAX_POS_RATIO, maxNWithPosSign / Integer.valueOf(maxNeighborhood.size()).doubleValue());
put(FEATURES.NBH_MIN_NEG_RATIO, minNWithNegSign / Integer.valueOf(minNeighborhood.size()).doubleValue());
}};
}
/**
* Perform linear regression over atoms' projection onto `lv` using their logP contributions as y-axis values.
*
* @return The slope of the regression line computed over the `lv`-projection.
*/
public Double performRegressionOverLVProjectionOfLogP() {
SimpleRegression regression = new SimpleRegression();
for (int i = 0; i < mol.getAtomCount(); i++) {
Double x = distancesAlongLongestVector.get(i);
Double y = plugin.getAtomlogPIncrement(i);
regression.addData(x, y);
}
regression.regress();
return regression.getSlope();
}
/**
* Perform linear regression over a list of X/Y coordinates
* @param coords A set of coordinates over which to perform linear regression.
* @return The slope and intercept of the regression line.
*/
public Pair<Double, Double> performRegressionOverXYPairs(List<Pair<Double, Double>> coords) {
SimpleRegression regression = new SimpleRegression(true);
for (Pair<Double, Double> c : coords) {
regression.addData(c.getLeft(), c.getRight());
}
// Note: the regress() call can raise an exception for small molecules. We should probably handle that gracefully.
RegressionResults result = regression.regress();
return Pair.of(regression.getSlope(), regression.getIntercept());
}
/**
* Computes plane-split (PS_*_) features for a list of AtomSplit objects, and returns the one that best separates
* positivie and negative logP-contributing atoms.
* @param atomSplits A list of atom splits for which to compute features.
* @return A pair of the best AtomSplit and its features.
*/
public Pair<AtomSplit, Map<FEATURES, Double>> findBestPlaneSplitFeatures(List<AtomSplit> atomSplits) {
double bestWeightedLogPDiff = 0.0;
AtomSplit bestAtomSplit = null;
Map<FEATURES, Double> features = null;
// Compute a bunch of metrics for every split, and take the one that best partitions the weighted logP delta.
for (AtomSplit ps : atomSplits) {
double absLogPDiff = Math.abs(ps.getLeftSum() - ps.getRightSum());
double absLogPSignDiff = ps.getLeftSum() * ps.getRightSum() < 0.000 ? 1.0 : 0.0;
double absLogPMinMaxDiff = Math.max(
ps.getLeftMax() - ps.getRightMin(),
ps.getRightMax() - ps.getLeftMin());
double weightedLogPDiff = Math.abs(ps.getWeightedLeftSum() - ps.getWeightedRightSum());
double weightedLogPSignDiff = ps.getWeightedLeftSum() * ps.getWeightedRightSum() < 0.000 ? 1.0 : 0.0;
int leftSize = ps.getLeftIndices().size();
int rightSize = ps.getRightIndices().size();
double lrSetSizeDiffRatio = Math.abs(Integer.valueOf(leftSize - rightSize).doubleValue() /
Integer.valueOf(leftSize + rightSize).doubleValue());
double sizeWeightedLeftSum = ps.getLeftSum() / Integer.valueOf(Math.max(leftSize, 1)).doubleValue();
double sizeWeightedRightSum = ps.getRightSum() / Integer.valueOf(Math.max(rightSize, 1)).doubleValue();
double sizeWeightedLeftWeightedSum =
ps.getWeightedLeftSum() / Integer.valueOf(Math.max(leftSize, 1)).doubleValue();
double sizeWeightedRightWeightedSum =
ps.getWeightedRightSum() / Integer.valueOf(Math.max(rightSize, 1)).doubleValue();
double lrPosNegCountRatio1 = Integer.valueOf(ps.getLeftNegCount()).doubleValue() /
Integer.valueOf(Math.max(ps.getRightPosCount(), 1)).doubleValue();
double lrPosNegCountRatio2 = Integer.valueOf(ps.getRightNegCount()).doubleValue() /
Integer.valueOf(Math.max(ps.getLeftPosCount(), 1)).doubleValue();
double leftPosNegRatio = Integer.valueOf(Math.min(ps.getLeftNegCount(), ps.getLeftPosCount())).doubleValue() /
Integer.valueOf(Math.max(ps.getLeftNegCount(), ps.getLeftPosCount())).doubleValue();
double rightPosNegRatio = Integer.valueOf(Math.min(ps.getRightNegCount(), ps.getRightPosCount())).doubleValue() /
Integer.valueOf(Math.max(ps.getRightNegCount(), ps.getRightPosCount())).doubleValue();
if (weightedLogPDiff > bestWeightedLogPDiff) {
bestWeightedLogPDiff = weightedLogPDiff;
bestAtomSplit = ps;
// Store the features while they're computed; seems like it'd be more expensive to recompute than store.
features = new HashMap<FEATURES, Double>(){{
put(FEATURES.PS_LEFT_MEAN_LOGP, ps.getLeftSum() / Integer.valueOf(Math.max(leftSize, 1)).doubleValue());
put(FEATURES.PS_RIGHT_MEAN_LOGP, ps.getRightSum() / Integer.valueOf(Math.max(rightSize, 1)).doubleValue());
put(FEATURES.PS_LR_SIZE_DIFF_RATIO, lrSetSizeDiffRatio);
put(FEATURES.PS_LR_POS_NEG_RATIO_1, lrPosNegCountRatio1);
put(FEATURES.PS_LR_POS_NEG_RATIO_2, lrPosNegCountRatio2);
put(FEATURES.PS_ABS_LOGP_DIFF, absLogPDiff);
put(FEATURES.PS_ABS_LOGP_SIGNS_DIFFER, absLogPSignDiff);
put(FEATURES.PS_WEIGHTED_LOGP_DIFF, weightedLogPDiff);
put(FEATURES.PS_WEIGHTED_LOGP_SIGNS_DIFFER, weightedLogPSignDiff);
put(FEATURES.PS_MAX_ABS_DIFF, absLogPMinMaxDiff);
put(FEATURES.PS_LEFT_POS_NEG_RATIO, leftPosNegRatio);
put(FEATURES.PS_RIGHT_POS_NEG_RATIO, rightPosNegRatio);
// TODO: add surface-contribution-based metrics as well.
}};
}
}
return Pair.of(bestAtomSplit, features);
}
/**
* Compute features related to the logP-labeled molecular surface computed by MarvinSpace.
* @param jFrame A jFrame to use when running MarvinSpace (seems strange but is requred).
* @param hydrogensShareNeighborsLogP Set to true if hydrogen atoms should share their neighbor's logP value.
* @return A map of features related to and depending on the computed molecular surface.
* @throws Exception
*/
public Map<FEATURES, Double> computeSurfaceFeatures(JFrame jFrame, boolean hydrogensShareNeighborsLogP)
throws Exception {
// TODO: use the proper marvin sketch scene to get better rendering control instead of MSpaceEasy.
MSpaceEasy mspace = new MSpaceEasy(1, 2, true);
mspace.addCanvas(jFrame.getContentPane());
mspace.setSize(1200, 600);
ArrayList<Double> logPVals = new ArrayList<>();
ArrayList<Double> hValues = new ArrayList<>();
// Store a list of ids so we can label the atoms in the surface rendering (otherwise we won't know what's what).
ArrayList<Integer> ids = new ArrayList<>();
MolAtom[] atoms = mol.getAtomArray();
for (int i = 0; i < mol.getAtomCount(); i++) {
ids.add(i);
Double logP = plugin.getAtomlogPIncrement(i);
logPVals.add(logP);
/* The surface renderer requires that we specify logP values for all hydrogens, which don't appear to have logP
* contributions calculated for them, in addition to non-hydrogen atoms. We fake this by either borrowing the
* hydrogen's neighbor's logP value, or setting it to 0.0.
* TODO: figure out what the command-line marvin sketch logP renderer does and do that instead.
* */
MolAtom molAtom = mol.getAtom(i);
for (int j = 0; j < molAtom.getImplicitHcount(); j++) {
// Note: the logPPlugin's deprecated getAtomlogPHIncrement method just uses the non-H neighbor's logP, as here.
// msketch seems to do something different, but it's unclear what that is.
hValues.add(hydrogensShareNeighborsLogP ? logP : 0.0);
}
}
/* Tack the hydrogen's logP contributions on to the list of proper logP values. The MSC renderer seems to expect
* the hydrogen's values after the non-hydrogen's values, so appending appears to work fine. */
logPVals.addAll(hValues);
// Compute the planes before rendering to avoid the addition of implicit hydrogens in the calculation.
// TODO: re-strip hydrogens after rendering to avoid these weird issues in general.
Map<Integer, Pair<List<Integer>, List<Integer>>> splitPlanes = splitAtomsByNormalPlanes();
MoleculeComponent mc1 = mspace.addMoleculeTo(mol, 0);
mspace.getEventHandler().createAtomLabels(mc1, ids);
// Don't draw hydrogens; it makes the drawing too noisy.
mspace.setProperty("MacroMolecule.Hydrogens", "false");
MoleculeComponent mc2 = mspace.addMoleculeTo(mol, 1);
MolecularSurfaceComponent msc = mspace.computeSurface(mc2);
SurfaceComponent sc = msc.getSurface();
// Note: if we call mol.getAtomArray() here, it will contain all the implicit hydrogens.
Map<Integer, Integer> surfaceComponentCounts = new HashMap<>();
for (int i = 0; i < atoms.length; i++) {
surfaceComponentCounts.put(i, 0);
}
for (int i = 0; i < sc.getVertexCount(); i++) {
DPoint3 c = new DPoint3(sc.getVertexX(i), sc.getVertexY(i), sc.getVertexZ(i));
Double closestDist = null;
Integer closestAtom = null;
for (int j = 0; j < atoms.length; j++) {
double dist = c.distance(atoms[j].getLocation());
if (closestDist == null || closestDist > dist) {
closestDist = dist;
closestAtom = j;
}
}
surfaceComponentCounts.put(closestAtom, surfaceComponentCounts.get(closestAtom) + 1);
}
// Build a line of (proj(p, lv), logP) pairs.
List<Pair<Double, Double>> weightedVals = new ArrayList<>();
for (int i = 0; i < atoms.length; i++) {
Integer count = surfaceComponentCounts.get(i);
Double logP = plugin.getAtomlogPIncrement(i);
Double x = distancesAlongLongestVector.get(i);
Double y = count.doubleValue() * logP;
// Ditch non-contributing atoms.
if (y < -0.001 || y > 0.001) {
weightedVals.add(Pair.of(x, y));
}
}
Collections.sort(weightedVals);
Pair<Double, Double> slopeIntercept = performRegressionOverXYPairs(weightedVals);
double valAtFarthestPoint =
distancesAlongLongestVector.get(lvIndex2) * slopeIntercept.getLeft() + slopeIntercept.getRight();
Map<FEATURES, Double> features = new HashMap<>();
features.put(FEATURES.REG_WEIGHTED_SLOPE, slopeIntercept.getLeft());
features.put(FEATURES.REG_WEIGHTED_INTERCEPT, slopeIntercept.getRight());
features.put(FEATURES.REG_VAL_AT_FARTHEST_POINT, valAtFarthestPoint);
/* Multiply the intercept with the value at the largest point to see if there's a sign change. If so, we'll
* get a negative number and know the regression line crosses the axis. */
features.put(FEATURES.REG_CROSSES_X_AXIS, valAtFarthestPoint * slopeIntercept.getRight() < 0.000 ? 1.0 : 0.0);
// Flatten the list of split planes and find the "best" one (i.e. the one that maximizes the weighted logP delta).
List<AtomSplit> allSplitPlanes = new ArrayList<>();
for (int i = 0; i < atoms.length; i++) {
if (!splitPlanes.containsKey(i)) {
continue;
}
Pair<List<Integer>, List<Integer>> splitAtoms = splitPlanes.get(i);
List<Integer> leftAtoms = splitAtoms.getLeft();
List<Integer> rightAtoms = splitAtoms.getRight();
Pair<AtomSplit, AtomSplit> splitVariants = AtomSplit.computePlaneSplitsForIntersectingAtom(
leftAtoms, rightAtoms, i, plugin, surfaceComponentCounts
);
AtomSplit l = splitVariants.getLeft();
AtomSplit r = splitVariants.getRight();
allSplitPlanes.add(l);
allSplitPlanes.add(r);
}
Pair<AtomSplit, Map<FEATURES, Double>> bestPsRes = findBestPlaneSplitFeatures(allSplitPlanes);
features.putAll(bestPsRes.getRight());
msc.setPalette(SurfaceColoring.COLOR_MAPPER_BLUE_TO_RED);
msc.showVolume(true);
// These parameters were selected via experimentation.
msc.setSurfacePrecision("High");
msc.setSurfaceType("van der Waals");
msc.setDrawProperty("Surface.DrawType", "Dot");
msc.setDrawProperty("Surface.Quality", "High");
msc.setAtomPropertyList(logPVals);
msc.setDrawProperty("Surface.ColorType", "AtomProperty");
// Don't display here--leave that to the owner of the JFrame.
return features;
}
public static final double[] SOLUBILITY_PHS = new double[] {2.5, 3.0, 3.5};
/**
* Calculate whole-molecule fatures used in post-processing and filtering.
* @return A map of whole-molecule features.
* @throws Exception
*/
public Map<FEATURES, Double> calculateAdditionalFilteringFeatures() throws Exception {
SolubilityCalculator sc = new SolubilityCalculator();
SolubilityResult[] solubility = sc.calculatePhDependentSolubility(mol, SOLUBILITY_PHS);
HlbPlugin hlb = HlbPlugin.Builder.createNew();
hlb.setMolecule(mol);
hlb.run();
double hlbVal = hlb.getHlbValue();
pKaPlugin pka = new pKaPlugin();
// From the documentation. Not sure what these knobs do...
pka.setBasicpKaLowerLimit(-5.0);
pka.setAcidicpKaUpperLimit(25.0);
pka.setpHLower(2.5); // for ms distr
pka.setpHUpper(3.5); // for ms distr
pka.setpHStep(0.5); // for ms distr
pka.setMolecule(mol);
pka.run();
double[] pkaAcidVals = new double[3];
int[] pkaAcidIndices = new int[3];
double[] pkaBasicVals = new double[3];
int[] pkaBasicIndices = new int[3];
// Also not sure these are the values we're interested in.
pka.getMacropKaValues(pKaPlugin.ACIDIC, pkaAcidVals, pkaAcidIndices);
pka.getMacropKaValues(pKaPlugin.BASIC, pkaBasicVals, pkaBasicIndices);
// TODO: compute carbon chain length.
return new HashMap<FEATURES, Double>() {{
put(FEATURES.SOL_MG_ML_25, solubility[0].getSolubility(SolubilityUnit.MGPERML));
put(FEATURES.SOL_MG_ML_30, solubility[1].getSolubility(SolubilityUnit.MGPERML));
put(FEATURES.SOL_MG_ML_35, solubility[2].getSolubility(SolubilityUnit.MGPERML));
put(FEATURES.PKA_ACID_1, pkaAcidVals[0]);
put(FEATURES.PKA_ACID_1_IDX, Integer.valueOf(pkaAcidIndices[0]).doubleValue());
put(FEATURES.PKA_ACID_2, pkaAcidVals[1]);
put(FEATURES.PKA_ACID_2_IDX, Integer.valueOf(pkaAcidIndices[1]).doubleValue());
put(FEATURES.PKA_ACID_3, pkaAcidVals[2]);
put(FEATURES.PKA_ACID_3_IDX, Integer.valueOf(pkaAcidIndices[2]).doubleValue());
put(FEATURES.PKA_BASE_1, pkaBasicVals[0]);
put(FEATURES.PKA_BASE_1_IDX, Integer.valueOf(pkaBasicIndices[0]).doubleValue());
put(FEATURES.PKA_BASE_2, pkaBasicVals[1]);
put(FEATURES.PKA_BASE_2_IDX, Integer.valueOf(pkaBasicIndices[1]).doubleValue());
put(FEATURES.PKA_BASE_3, pkaBasicVals[2]);
put(FEATURES.PKA_BASE_3_IDX, Integer.valueOf(pkaBasicIndices[2]).doubleValue());
put(FEATURES.HLB_VAL, hlbVal);
}};
}
public String getInchi() {
return inchi;
}
public logPPlugin getPlugin() {
return plugin;
}
public Molecule getMol() {
return mol;
}
public Map<MolAtom, Integer> getAtomToIndexMap() {
return atomToIndexMap;
}
public Integer getLvIndex1() {
return lvIndex1;
}
public Integer getLvIndex2() {
return lvIndex2;
}
public List<DPoint3> getNormalizedCoordinates() {
return normalizedCoordinates;
}
public MajorMicrospeciesPlugin getMicrospeciesPlugin() {
return microspeciesPlugin;
}
public Map<Integer, Double> getDistancesFromLongestVector() {
return distancesFromLongestVector;
}
public Map<Integer, Double> getDistancesAlongLongestVector() {
return distancesAlongLongestVector;
}
public Map<Integer, Plane> getNormalPlanes() {
return normalPlanes;
}
// TODO: add greedy high/low logP neighborhood picking, compute bounding balls, and calc intersection (spherical cap).
// TODO: restructure this class to make the analysis steps more modular (now they're coupled to surface computation).
/**
* Perform all analysis for a molecule, returning a map of all available features.
* @param inchi The molecule to analyze.
* @param display True if the molecule should be displayed; set to false for non-interactive analysis.
* @return A map of all features for this molecule.
* @throws Exception
*/
public static Map<FEATURES, Double> performAnalysis(String inchi, boolean display) throws Exception {
SurfactantAnalysis surfactantAnalysis = new SurfactantAnalysis();
surfactantAnalysis.init(inchi);
// Start with simple structural analyses.
Pair<Integer, Integer> farthestAtoms = surfactantAnalysis.findFarthestContributingAtomPair();
Double longestVectorLength = surfactantAnalysis.computeDistance(farthestAtoms.getLeft(), farthestAtoms.getRight());
// Then compute the atom distances to the longest vector (lv) and produce lv-normal planes at each atom.
Pair<Map<Integer, Double> , Map<Integer, Plane>> results =
surfactantAnalysis.computeAtomDistanceToLongestVectorAndNormalPlanes();
// Find the max distance so we can calculate the maxDist/|lv| ratio, or "skinny" factor.
double maxDistToLongestVector = 0.0;
Map<Integer, Double> distancesToLongestVector = results.getLeft();
for (Map.Entry<Integer, Double> e : distancesToLongestVector.entrySet()) {
maxDistToLongestVector = Math.max(maxDistToLongestVector, e.getValue());
}
// A map of the molecule features we'll eventually output.
Map<FEATURES, Double> features = new HashMap<>();
// Explore the lv endpoint and min/max logP atom neighborhoods, and merge those features into the complete map.
Map<FEATURES, Double> neighborhoodFeatures = surfactantAnalysis.exploreExtremeNeighborhoods();
features.putAll(neighborhoodFeatures);
/* Perform regression analysis on the projection of the molecules onto lv, where their y-axis is their logP value.
* Higher |slope| may mean more extreme logP differences at the ends. */
Double slope = surfactantAnalysis.performRegressionOverLVProjectionOfLogP();
/* Compute the logP surface of the molecule (seems to require a JFrame?), and collect those features. We consider
* the number of closest surface components to each atom so we can guess at how much interior atoms actually
* contribute to the molecule's solubility. */
JFrame jFrame = new JFrame();
jFrame.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE);
Map<FEATURES, Double> surfaceFeatures = surfactantAnalysis.computeSurfaceFeatures(jFrame, true);
features.putAll(surfaceFeatures);
features.put(FEATURES.LOGP_TRUE, surfactantAnalysis.plugin.getlogPTrue()); // Save absolute logP since we calculated it.
features.put(FEATURES.GEO_LV_FD_RATIO, maxDistToLongestVector / longestVectorLength);
features.put(FEATURES.REG_ABS_SLOPE, slope);
Map<FEATURES, Double> additionalFeatures = surfactantAnalysis.calculateAdditionalFilteringFeatures();
features.putAll(additionalFeatures);
List<FEATURES> sortedFeatures = new ArrayList<>(features.keySet());
Collections.sort(sortedFeatures);
// Print these for easier progress tracking.
System.out.format("features:\n");
for (FEATURES f : sortedFeatures) {
System.out.format(" %s = %f\n", f, features.get(f));
}
if (display) {
jFrame.pack();
jFrame.setVisible(true);
}
return features;
}
}