// SurfactantAnalysis.java example
//
// Explorer
// act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.analysis.surfactant;


import chemaxon.calculations.clean.Cleaner;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.marvin.calculations.HlbPlugin;
import chemaxon.marvin.calculations.LogPMethod;
import chemaxon.marvin.calculations.MajorMicrospeciesPlugin;
import chemaxon.marvin.calculations.logPPlugin;
import chemaxon.marvin.calculations.pKaPlugin;
import chemaxon.marvin.plugin.PluginException;
import chemaxon.marvin.space.MSpaceEasy;
import chemaxon.marvin.space.MolecularSurfaceComponent;
import chemaxon.marvin.space.MoleculeComponent;
import chemaxon.marvin.space.SurfaceColoring;
import chemaxon.marvin.space.SurfaceComponent;
import chemaxon.struc.DPoint3;
import chemaxon.struc.MolAtom;
import chemaxon.struc.MolBond;
import chemaxon.struc.Molecule;
import com.chemaxon.calculations.solubility.SolubilityCalculator;
import com.chemaxon.calculations.solubility.SolubilityResult;
import com.chemaxon.calculations.solubility.SolubilityUnit;
import com.dreizak.miniball.highdim.Miniball;
import com.dreizak.miniball.model.ArrayPointSet;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.stat.regression.RegressionResults;
import org.apache.commons.math3.stat.regression.SimpleRegression;

import javax.swing.JFrame;
import javax.swing.WindowConstants;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SurfactantAnalysis {
  // InChI of the molecule under analysis; assigned by init().
  String inchi;
  // logP calculator; init() also uses it to standardize the imported molecule, and per-atom logP increments are
  // read from it throughout this class.
  logPPlugin plugin = new logPPlugin();
  // Produces the major microspecies of the imported molecule at the pH configured in init().
  MajorMicrospeciesPlugin microspeciesPlugin = new MajorMicrospeciesPlugin();

  // The result molecule handed back by the logP plugin at the end of init().
  Molecule mol;
  // MolAtom objects don't seem to record their index in the parent molecule, so we'll build a mapping here.
  Map<MolAtom, Integer> atomToIndexMap = new HashMap<>();

  // Atom indices for the longest vector between any two atoms in the molecule.
  // Both are assigned by findFarthestContributingAtomPair() and stay null until it has run.
  Integer lvIndex1;
  Integer lvIndex2;
  // Coordinates with lvIndex1 treated as the origin.
  List<DPoint3> normalizedCoordinates;
  // The three maps below are keyed by atom index and filled by computeAtomDistanceToLongestVectorAndNormalPlanes():
  // perpendicular distance from the longest vector, projection length along it, and the plane normal to it.
  Map<Integer, Double> distancesFromLongestVector = new HashMap<>();
  Map<Integer, Double> distancesAlongLongestVector = new HashMap<>();
  Map<Integer, Plane> normalPlanes = new HashMap<>();

  // Atoms with max/min logP values.
  // Assigned by exploreExtremeNeighborhoods().
  Integer maxLogPIndex;
  Integer minLogPIndex;

  /**
   * Names of the numeric features this class can emit.  Every feature-producing method returns a map keyed by these
   * constants; the grouping comments below name the method in this file that fills each group.
   */
  public enum FEATURES {
    // Whole-molecule features.
    // NOTE(review): nothing visible in this file populates LOGP_TRUE; presumably a caller does -- confirm.
    LOGP_TRUE,

    // TODO: logD-based features, not implemented yet:
    //LOGD_7_4,
    //LOGD_2_5,
    //LOGD_RATIO,

    // Plane split features (filled by findBestPlaneSplitFeatures).
    PS_LEFT_MEAN_LOGP,
    PS_RIGHT_MEAN_LOGP,
    PS_LR_SIZE_DIFF_RATIO,
    PS_LR_POS_NEG_RATIO_1, // Left neg / right pos
    PS_LR_POS_NEG_RATIO_2, // Right neg / left pos
    PS_ABS_LOGP_DIFF,
    PS_ABS_LOGP_SIGNS_DIFFER,
    PS_WEIGHTED_LOGP_DIFF,
    PS_WEIGHTED_LOGP_SIGNS_DIFFER,
    PS_MAX_ABS_DIFF, // This should be equivalent to the old split metric from the DARPA report (I hope).
    PS_LEFT_POS_NEG_RATIO,
    PS_RIGHT_POS_NEG_RATIO,

    // Regression features (computeSurfaceFeatures fills all but REG_ABS_SLOPE, which has no visible producer here).
    REG_WEIGHTED_SLOPE,
    REG_WEIGHTED_INTERCEPT,
    REG_VAL_AT_FARTHEST_POINT,
    REG_CROSSES_X_AXIS,
    REG_ABS_SLOPE,

    // Geometric features.
    // NOTE(review): no visible producer in this file -- confirm where GEO_LV_FD_RATIO is computed.
    GEO_LV_FD_RATIO,

    // Extreme neighborhood features (filled by exploreExtremeNeighborhoods).
    NBH_MAX_AND_MIN_TOGETHER,
    NBH_MAX_IN_V1,
    NBH_MAX_IN_V2,
    NBH_MIN_IN_V1,
    NBH_MIN_IN_V2,
    NBH_MAX_N_MEAN,
    NBH_MIN_N_MEAN,
    NBH_MAX_POS_RATIO,
    NBH_MIN_NEG_RATIO,

    // Solubility features in mg/ml at pH 2.5, 3.0 and 3.5 (filled by calculateAdditionalFilteringFeatures).
    SOL_MG_ML_25,
    SOL_MG_ML_30,
    SOL_MG_ML_35,

    // pKa features: up to three acidic and three basic values, each paired with the index reported alongside it
    // (filled by calculateAdditionalFilteringFeatures).
    PKA_ACID_1, PKA_ACID_1_IDX,
    PKA_ACID_2, PKA_ACID_2_IDX,
    PKA_ACID_3, PKA_ACID_3_IDX,
    PKA_BASE_1, PKA_BASE_1_IDX,
    PKA_BASE_2, PKA_BASE_2_IDX,
    PKA_BASE_3, PKA_BASE_3_IDX,

    // HLB features (filled by calculateAdditionalFilteringFeatures).
    HLB_VAL,
  }

  /**
   * Creates an analysis with no molecule loaded.  The molecule is only set by {@link #init(String)}, so call that
   * before any method that reads it.
   */
  public SurfactantAnalysis() { }

  /**
   * Imports a molecule and runs essential calculations (like logP).
   *
   * <p>The InChI is imported, given 3D coordinates, standardized, converted to its major microspecies at pH 1.5 and
   * then run through the logP plugin.  All state derived from a previously analyzed molecule (atom index map,
   * longest-vector indices, projections, normal planes, extreme-logP indices) is discarded, so an instance can be
   * re-initialized without stale entries leaking into the new analysis.
   *
   * @param inchi The InChI of a molecule to be imported.
   * @throws MolFormatException
   * @throws PluginException
   * @throws IOException
   */
  public void init(String inchi)
      throws MolFormatException, PluginException, IOException {
    this.inchi = inchi;
    Molecule importMol = MolImporter.importMol(this.inchi);
    Cleaner.clean(importMol, 3); // This will assign 3D atom coordinates to the MolAtoms of the imported molecule.
    plugin.standardize(importMol);

    // Note: this doesn't seem to have any effect, but we'll try anyway for our current use case.
    microspeciesPlugin.setpH(1.5);
    microspeciesPlugin.setMolecule(importMol);
    microspeciesPlugin.run();
    Molecule phMol = microspeciesPlugin.getMajorMicrospecies();

    plugin.setlogPMethod(LogPMethod.CONSENSUS);

    // TODO: do we need to explicitly specify ion concentration?
    plugin.setUserTypes("logPTrue,logPMicro,logPNonionic"); // These arguments were chosen via experimentation.

    plugin.setMolecule(phMol);
    plugin.run();

    // Everything below is keyed by atom index or MolAtom identity of the *previous* molecule; keeping it around
    // would let e.g. a stale normal plane be picked up for an atom index of the new molecule.
    atomToIndexMap.clear();
    distancesFromLongestVector.clear();
    distancesAlongLongestVector.clear();
    normalPlanes.clear();
    lvIndex1 = null;
    lvIndex2 = null;
    normalizedCoordinates = null;
    maxLogPIndex = null;
    minLogPIndex = null;

    this.mol = plugin.getResultMolecule();

    // The logP values exposed by the plugin are only accessible by index; make an object -> id map for easier lookup.
    MolAtom[] molAtoms = mol.getAtomArray();
    for (int i = 0; i < molAtoms.length; i++) {
      atomToIndexMap.put(molAtoms[i], i);
    }
  }

  /**
   * Finds the pair of most distant atoms that contribute to the molecule's logP value.
   *
   * <p>Atoms whose logP increment is NaN are ignored.  As a side effect this stores the winning indices in
   * {@code lvIndex1}/{@code lvIndex2} and recomputes {@code normalizedCoordinates} with the first atom of the pair
   * as the origin.
   *
   * @return A pair of atom indices for the two most distant atoms in the molecule; the left index is always the
   *         smaller one.
   * @throws IllegalStateException if the molecule does not contain two contributing atoms at distinct positions.
   */
  public Pair<Integer, Integer> findFarthestContributingAtomPair() {
    final int atomCount = mol.getAtomCount();
    double maxDist = 0.0d;
    Integer di1 = null, di2 = null; // Endpoint atoms of the diameter of the structure.
    for (int i = 0; i < atomCount; i++) {
      if (Double.isNaN(plugin.getAtomlogPIncrement(i))) {
        continue;
      }
      DPoint3 c1 = mol.getAtom(i).getLocation();

      // Distance is symmetric, so only pairs with j > i need to be inspected.  Because the comparison is strict,
      // this selects exactly the pair a full i/j scan would have selected.
      for (int j = i + 1; j < atomCount; j++) {
        if (Double.isNaN(plugin.getAtomlogPIncrement(j))) {
          continue;
        }

        double dist = c1.distance(mol.getAtom(j).getLocation());
        if (dist > maxDist) {
          maxDist = dist;
          di1 = i;
          di2 = j;
        }
      }
    }

    this.lvIndex1 = di1;
    this.lvIndex2 = di2;

    if (di1 == null) {
      // Fail with a descriptive error rather than an unboxing NullPointerException further down.
      throw new IllegalStateException(
          "No pair of logP-contributing atoms at distinct positions found for molecule: " + inchi);
    }

    this.normalizedCoordinates = resetOriginForCoordinates(di1);

    return Pair.of(di1, di2);
  }

  /**
   * Measures how far apart two atoms are, using the origin-shifted coordinates held in
   * {@code normalizedCoordinates}.
   *
   * @param a1 Index of the first atom.
   * @param a2 Index of the second atom.
   * @return The distance between the two atoms in the molecule's coordinate space (units not specified).
   */
  public Double computeDistance(Integer a1, Integer a2) {
    DPoint3 first = this.normalizedCoordinates.get(a1);
    DPoint3 second = this.normalizedCoordinates.get(a2);
    return first.distance(second);
  }

  /**
   * Produces a copy of every atom's location, translated so that the chosen atom sits at the origin.
   *
   * @param newOriginIndex Index of the atom that becomes the new origin.
   * @return One translated coordinate per atom, in atom-index order.
   */
  public List<DPoint3> resetOriginForCoordinates(Integer newOriginIndex) {
    DPoint3 origin = mol.getAtom(newOriginIndex).getLocation();
    List<DPoint3> shifted = new ArrayList<>();
    // NOTE(review): the in-place subtract below presumably operates on a fresh DPoint3 returned by getLocation();
    // if getLocation() ever handed out a shared instance this would move the atoms themselves -- confirm.
    for (int atomIdx = 0; atomIdx < mol.getAtomCount(); atomIdx++) {
      DPoint3 location = mol.getAtom(atomIdx).getLocation();
      location.subtract(origin);
      shifted.add(location);
    }
    return shifted;
  }

  /**
   * Holds the four coefficients of the expression {@code a*x + b*y + c*z + d}.  Callers in this file use the sign
   * of that expression to decide on which side of the plane a point lies.
   */
  public static class Plane {
    public double a, b, c, d;

    /**
     * @param a Coefficient applied to x.
     * @param b Coefficient applied to y.
     * @param c Coefficient applied to z.
     * @param d Constant term.
     */
    public Plane(double a, double b, double c, double d) {
      this.d = d;
      this.c = c;
      this.b = b;
      this.a = a;
    }

    /**
     * Evaluates {@code a*x + b*y + c*z + d} for the given point.
     *
     * @return The value of the plane expression at (x, y, z).
     */
    public double computeProductForPoint(double x, double y, double z) {
      // Terms are summed left to right, exactly as in the single-expression form, to keep results bit-identical.
      double xTerm = a * x;
      double yTerm = b * y;
      double zTerm = c * z;
      return xTerm + yTerm + zTerm + d;
    }
  }

  /**
   * Computes an atom's projection onto `lv` and the `lv`-normal plane that intersects that projection, where `lv` is
   * the vector between the pair of most distant atoms in the molecule.
   *
   * <p>Results are written into {@code distancesFromLongestVector}, {@code distancesAlongLongestVector} and
   * {@code normalPlanes}.  The two `lv` endpoint atoms get distances but no plane.  Requires
   * {@link #findFarthestContributingAtomPair()} to have run, since it reads {@code lvIndex1}, {@code lvIndex2} and
   * {@code normalizedCoordinates}.
   *
   * @return Maps of atomic indices to distances from `lv` and to an `lv`-normal plane that intersects that atom.
   */
  public Pair<Map<Integer, Double>, Map<Integer, Plane>> computeAtomDistanceToLongestVectorAndNormalPlanes() {
    List<DPoint3> coords = this.normalizedCoordinates;
    // `lv` in origin-shifted coordinates is simply the location of the second endpoint; it does not change per atom.
    DPoint3 diameter = coords.get(lvIndex2);
    double diameterLength = Math.sqrt(diameter.lengthSquare());

    for (int i = 0; i < mol.getAtomCount(); i++) {
      if (i == lvIndex1 || i == lvIndex2) {
        continue;
      }

      DPoint3 exp = coords.get(i);

      double dotProduct = diameter.x * exp.x + diameter.y * exp.y + diameter.z * exp.z;
      double vLength = Math.sqrt(exp.lengthSquare());

      double perpendicularDist = 0.0;
      double proj = 0.0;
      // An atom sitting exactly on the origin has no direction; the 0/0 cosine would otherwise yield NaN.
      if (vLength > 0.0) {
        double cosine = dotProduct / (diameterLength * vLength);
        // Rounding can push cosine^2 marginally above 1 for atoms (nearly) collinear with `lv`; clamp so the square
        // root yields 0 instead of NaN.
        double sine = Math.sqrt(Math.max(0.0, 1.0 - cosine * cosine));
        perpendicularDist = sine * vLength;
        proj = cosine * vLength;
      }

      distancesFromLongestVector.put(i, perpendicularDist);
      distancesAlongLongestVector.put(i, proj);
      // Plane with normal `lv` passing through this atom: lv . p - lv . atom = 0.
      normalPlanes.put(i, new Plane(diameter.x, diameter.y, diameter.z, -1d * dotProduct));
    }

    distancesFromLongestVector.put(lvIndex1, 0.0);
    distancesFromLongestVector.put(lvIndex2, 0.0);

    distancesAlongLongestVector.put(lvIndex1, 0.0);
    distancesAlongLongestVector.put(lvIndex2, diameterLength);

    return Pair.of(distancesFromLongestVector, normalPlanes);
  }

  /**
   * For every atom that owns an `lv`-normal plane, partitions all other atoms by the side of that plane they fall on.
   *
   * @return A map from plane-owning atom index to a pair of index lists: atoms with a negative plane product on the
   *         left, all remaining atoms (including coplanar ones) on the right.
   */
  public Map<Integer, Pair<List<Integer>, List<Integer>>> splitAtomsByNormalPlanes() {
    List<DPoint3> shifted = resetOriginForCoordinates(lvIndex1);
    Map<Integer, Pair<List<Integer>, List<Integer>>> partitions = new HashMap<>();

    for (int pivot = 0; pivot < mol.getAtomCount(); pivot++) {
      Plane plane = normalPlanes.get(pivot);
      // Atoms without a plane (e.g. the `lv` endpoints) do not define a split.
      if (plane != null) {
        List<Integer> negative = new ArrayList<>();
        List<Integer> nonNegative = new ArrayList<>();

        for (int other = 0; other < mol.getAtomCount(); other++) {
          if (other != pivot) {
            DPoint3 point = shifted.get(other);
            // A coplanar atom (product of exactly zero) is unlikely, but is counted on the non-negative side.
            boolean isNegative = plane.computeProductForPoint(point.x, point.y, point.z) < 0.0000d;
            (isNegative ? negative : nonNegative).add(other);
          }
        }
        partitions.put(pivot, Pair.of(negative, nonNegative));
      }
    }

    return partitions;
  }

  /**
   * Finds the smallest ball enclosing the given points by delegating to Miniball.
   *
   * @param coords The points to enclose.
   * @return The ball's center and its radius.
   */
  public Pair<DPoint3, Double> computeMinimumBoundingBall(List<DPoint3> coords) {
    ArrayPointSet points = new ArrayPointSet(3, coords.size());
    int row = 0;
    for (DPoint3 coord : coords) {
      // One row per point, one column per axis.
      points.set(row, 0, coord.x);
      points.set(row, 1, coord.y);
      points.set(row, 2, coord.z);
      row++;
    }

    Miniball ball = new Miniball(points);
    double[] centerXyz = ball.center();
    DPoint3 center = new DPoint3(centerXyz[0], centerXyz[1], centerXyz[2]);
    double radius = ball.radius();
    return Pair.of(center, radius);
  }

  /**
   * Computes the minimum bounding ball for all atoms that contribute to the molecule's logP value.
   *
   * @return A center and radius for the minimum bounding ball around logP-contributing atoms.
   */
  public Pair<DPoint3, Double> computeMinimumBoundingBallForContributingAtoms() {
    MolAtom[] allAtoms = mol.getAtomArray();
    List<DPoint3> contributingLocations = new ArrayList<>(allAtoms.length);
    for (int idx = 0; idx < allAtoms.length; idx++) {
      // A NaN logP increment marks an atom that does not contribute; such atoms are left out of the ball.
      boolean contributes = !Double.isNaN(plugin.getAtomlogPIncrement(idx));
      if (contributes) {
        contributingLocations.add(allAtoms[idx].getLocation());
      }
    }
    return computeMinimumBoundingBall(contributingLocations);
  }

  /**
   * Explore the neighborhood within `depth` steps of the atom with the specified atomic index, returning a map of
   * neighboring atomic indices to their step-wise distance from the specified origin atom.
   *
   * <p>The distance reported for each atom is the minimum number of bonds between it and the origin, and every atom
   * reachable within `depth` bonds is included (the origin itself maps to 0).
   *
   * @param index The index of the atom whose neighborhood to explore.
   * @param depth The maximum number of steps to take away from the origin atom.
   * @return A map of atomic index to step-wise distance from the specified origin atom.
   */
  public Map<Integer, Integer> exploreNeighborhood(int index, int depth) {
    return exploreNeighborhoodHelper(index, depth, depth, new HashMap<>());
  }

  /**
   * Walks the bond graph breadth-first from `index`, recording each newly reached atom's distance.
   *
   * <p>A depth-first walk that stamps atoms on first visit is not sufficient here: in a ring an atom can first be
   * reached along the long way round, which both records a too-large distance and exhausts the step budget before
   * that atom's own neighbors are explored.  Expanding level by level guarantees minimal distances.
   *
   * @param index The atom to start from.
   * @param baseDepth The total step budget; no atom farther than this from the origin is recorded.
   * @param depth The steps still available from `index`; `index` is therefore at distance `baseDepth - depth`.
   * @param atomsAndDepths Accumulator of atom index to distance; atoms already present are never overwritten.
   * @return The `atomsAndDepths` accumulator, for convenience.
   */
  private Map<Integer, Integer> exploreNeighborhoodHelper(int index, int baseDepth, int depth,
                                                          Map<Integer, Integer> atomsAndDepths) {
    int distance = baseDepth - depth;
    if (!atomsAndDepths.containsKey(index)) {
      atomsAndDepths.put(index, distance);
    }

    List<Integer> frontier = new ArrayList<>();
    frontier.add(index);
    while (distance < baseDepth && !frontier.isEmpty()) {
      distance++;
      List<Integer> nextFrontier = new ArrayList<>();
      for (int current : frontier) {
        MolAtom atom = mol.getAtom(current);
        for (MolBond b : atom.getBondArray()) {
          // A bond lists both of its atoms; pick whichever end is not the atom we came from.
          MolAtom dest = b.getAtom1().equals(atom) ? b.getAtom2() : b.getAtom1();
          int desti = atomToIndexMap.get(dest);

          if (!atomsAndDepths.containsKey(desti)) {
            atomsAndDepths.put(desti, distance);
            nextFrontier.add(desti);
          }
        }
      }
      frontier = nextFrontier;
    }
    return atomsAndDepths;
  }

  public static final Double MIN_AND_MAX_LOG_P_LONGEST_VECTOR_BOOST = 0.00001;
  /**
   * Walk bonds from the lv endpoints and min/max logP atoms, computing stats about their makeup.
   *
   * <p>The atoms with the largest and smallest logP increment are located first (the `lv` endpoints get a tiny
   * boost away from zero so that they win ties) and stored in {@code maxLogPIndex}/{@code minLogPIndex}.  Two-bond
   * neighborhoods around those atoms and around both `lv` endpoints are then compared and summarized.  Atoms whose
   * logP increment is NaN are never chosen as an extreme.
   *
   * @return A map of features to numeric values for extreme-neighborhood type attributes (NBH_*).
   * @throws IllegalStateException if no atom has a non-NaN logP increment.
   */
  public Map<FEATURES, Double> exploreExtremeNeighborhoods() {
    Integer vMax = null, vMin = null;
    double lpMax = 0.0, lpMin = 0.0;
    for (int i = 0; i < mol.getAtomCount(); i++) {
      double lp = plugin.getAtomlogPIncrement(i);
      if (Double.isNaN(lp)) {
        // Comparisons against NaN are always false, so a NaN picked up as the running extreme could never be
        // displaced again; non-contributing atoms must not take part in the search.
        continue;
      }
      if (i == lvIndex1 || i == lvIndex2) {
        // Boost the most distant points by a little bit to break ties.
        lp = lp > 0.0 ? lp + MIN_AND_MAX_LOG_P_LONGEST_VECTOR_BOOST : lp - MIN_AND_MAX_LOG_P_LONGEST_VECTOR_BOOST;
      }
      if (vMax == null || lp > lpMax) {
        vMax = i;
        lpMax = lp;
      }

      if (vMin == null || lp < lpMin) {
        vMin = i;
        lpMin = lp;
      }
    }

    if (vMax == null) {
      throw new IllegalStateException("No logP-contributing atoms found for molecule: " + inchi);
    }

    maxLogPIndex = vMax;
    minLogPIndex = vMin;

    Map<Integer, Integer> maxNeighborhood = exploreNeighborhood(vMax, 2);
    Map<Integer, Integer> minNeighborhood = exploreNeighborhood(vMin, 2);

    Map<Integer, Integer> v1Neighborhood = exploreNeighborhood(lvIndex1, 2);
    Map<Integer, Integer> v2Neighborhood = exploreNeighborhood(lvIndex2, 2);

    boolean maxAndMinInSimilarNeighborhood = maxNeighborhood.containsKey(vMin);
    boolean maxInV1N = v1Neighborhood.containsKey(vMax);
    boolean maxInV2N = v2Neighborhood.containsKey(vMax);
    boolean minInV1N = v1Neighborhood.containsKey(vMin);
    boolean minInV2N = v2Neighborhood.containsKey(vMin);

    double maxNSum = 0.0;
    int maxNWithPosSign = 0;
    for (Integer i : maxNeighborhood.keySet()) {
      double logp = plugin.getAtomlogPIncrement(i);
      maxNSum += logp;
      if (logp >= 0.0) {
        maxNWithPosSign++;
      }
    }

    double minNSum = 0.0;
    int minNWithNegSign = 0;
    for (Integer i : minNeighborhood.keySet()) {
      double logp = plugin.getAtomlogPIncrement(i);
      minNSum += logp;
      if (logp <= 0.0) {
        minNWithNegSign++;
      }
    }

    // A neighborhood always contains at least its origin atom, so these sizes are never zero.
    double maxNSize = maxNeighborhood.size();
    double minNSize = minNeighborhood.size();

    // A plain HashMap is used on purpose: an anonymous "double brace" subclass would keep a hidden reference to
    // this analysis (and its molecule) alive for as long as the returned map exists.
    Map<FEATURES, Double> features = new HashMap<>();
    // Boolean -> float makes these friendly to downstream analysis.
    features.put(FEATURES.NBH_MAX_AND_MIN_TOGETHER, maxAndMinInSimilarNeighborhood ? 1.0 : 0.0);
    features.put(FEATURES.NBH_MAX_IN_V1, maxInV1N ? 1.0 : 0.0);
    features.put(FEATURES.NBH_MAX_IN_V2, maxInV2N ? 1.0 : 0.0);
    features.put(FEATURES.NBH_MIN_IN_V1, minInV1N ? 1.0 : 0.0);
    features.put(FEATURES.NBH_MIN_IN_V2, minInV2N ? 1.0 : 0.0);
    features.put(FEATURES.NBH_MAX_N_MEAN, maxNSum / maxNSize);
    // The min-neighborhood mean must be averaged over the min neighborhood, not the max one.
    features.put(FEATURES.NBH_MIN_N_MEAN, minNSum / minNSize);
    features.put(FEATURES.NBH_MAX_POS_RATIO, maxNWithPosSign / maxNSize);
    features.put(FEATURES.NBH_MIN_NEG_RATIO, minNWithNegSign / minNSize);
    return features;
  }

  /**
   * Fits a straight line through one point per atom, where x is the atom's stored projection along `lv` and y is
   * its logP increment.
   *
   * @return The slope of the fitted line.
   */
  public Double performRegressionOverLVProjectionOfLogP() {
    SimpleRegression lvFit = new SimpleRegression();
    int atomIdx = 0;
    while (atomIdx < mol.getAtomCount()) {
      Double projection = distancesAlongLongestVector.get(atomIdx);
      Double logPIncrement = plugin.getAtomlogPIncrement(atomIdx);
      lvFit.addData(projection, logPIncrement);
      atomIdx++;
    }
    // The result object is not needed; the call is retained so that any exception it raises still surfaces here.
    lvFit.regress();
    double slope = lvFit.getSlope();
    return slope;
  }

  /**
   * Fits a straight line (with intercept) through the supplied points.
   *
   * @param coords The (x, y) points to fit.
   * @return The fitted line's slope (left) and intercept (right).
   */
  public Pair<Double, Double> performRegressionOverXYPairs(List<Pair<Double, Double>> coords) {
    SimpleRegression fit = new SimpleRegression(true);
    for (Pair<Double, Double> point : coords) {
      Double x = point.getLeft();
      Double y = point.getRight();
      fit.addData(x, y);
    }
    // Note: the regress() call can raise an exception for small molecules.  We should probably handle that gracefully.
    fit.regress();
    double slope = fit.getSlope();
    double intercept = fit.getIntercept();
    return Pair.of(slope, intercept);
  }

  /**
   * Computes plane-split (PS_*) features for a list of AtomSplit objects, and returns the one that best separates
   * positive and negative logP-contributing atoms.
   *
   * <p>"Best" means the largest absolute difference between the weighted left and right logP sums.  Only splits
   * with a difference strictly greater than zero qualify; on ties the earliest split in the list wins.
   *
   * @param atomSplits A list of atom splits for which to compute features.
   * @return A pair of the best AtomSplit and its features.  When no split qualifies (including an empty input) the
   *         split is null and the feature map is empty, never null.
   */
  public Pair<AtomSplit, Map<FEATURES, Double>> findBestPlaneSplitFeatures(List<AtomSplit> atomSplits) {
    double bestWeightedLogPDiff = 0.0;
    AtomSplit bestAtomSplit = null;
    Map<FEATURES, Double> features = new HashMap<>();
    for (AtomSplit ps : atomSplits) {
      double weightedLogPDiff = Math.abs(ps.getWeightedLeftSum() - ps.getWeightedRightSum());
      if (weightedLogPDiff > bestWeightedLogPDiff) {
        bestWeightedLogPDiff = weightedLogPDiff;
        bestAtomSplit = ps;
        // Only a new leader needs its features materialized; losers' features were never used.
        features = computePlaneSplitFeatures(ps, weightedLogPDiff);
      }
    }
    return Pair.of(bestAtomSplit, features);
  }

  // Builds the PS_* feature map for a single split; every count-based denominator is floored at 1 to avoid 0/0.
  private static Map<FEATURES, Double> computePlaneSplitFeatures(AtomSplit ps, double weightedLogPDiff) {
    double leftSum = ps.getLeftSum();
    double rightSum = ps.getRightSum();
    double weightedLeftSum = ps.getWeightedLeftSum();
    double weightedRightSum = ps.getWeightedRightSum();

    int leftSize = ps.getLeftIndices().size();
    int rightSize = ps.getRightIndices().size();
    int leftNeg = ps.getLeftNegCount();
    int leftPos = ps.getLeftPosCount();
    int rightNeg = ps.getRightNegCount();
    int rightPos = ps.getRightPosCount();

    double absLogPMinMaxDiff = Math.max(
        ps.getLeftMax()  - ps.getRightMin(),
        ps.getRightMax() - ps.getLeftMin());
    double lrSetSizeDiffRatio =
        Math.abs((double) (leftSize - rightSize) / (double) Math.max(leftSize + rightSize, 1));
    // Minority-sign count over majority-sign count on each side: 0 means uniformly signed, 1 means evenly mixed.
    double leftPosNegRatio = (double) Math.min(leftNeg, leftPos) / (double) Math.max(Math.max(leftNeg, leftPos), 1);
    double rightPosNegRatio =
        (double) Math.min(rightNeg, rightPos) / (double) Math.max(Math.max(rightNeg, rightPos), 1);

    Map<FEATURES, Double> features = new HashMap<>();
    features.put(FEATURES.PS_LEFT_MEAN_LOGP, leftSum / (double) Math.max(leftSize, 1));
    features.put(FEATURES.PS_RIGHT_MEAN_LOGP, rightSum / (double) Math.max(rightSize, 1));
    features.put(FEATURES.PS_LR_SIZE_DIFF_RATIO, lrSetSizeDiffRatio);
    features.put(FEATURES.PS_LR_POS_NEG_RATIO_1, (double) leftNeg / (double) Math.max(rightPos, 1));
    features.put(FEATURES.PS_LR_POS_NEG_RATIO_2, (double) rightNeg / (double) Math.max(leftPos, 1));
    features.put(FEATURES.PS_ABS_LOGP_DIFF, Math.abs(leftSum - rightSum));
    // A negative product means the two sums have opposite signs.
    features.put(FEATURES.PS_ABS_LOGP_SIGNS_DIFFER, leftSum * rightSum < 0.000 ? 1.0 : 0.0);
    features.put(FEATURES.PS_WEIGHTED_LOGP_DIFF, weightedLogPDiff);
    features.put(FEATURES.PS_WEIGHTED_LOGP_SIGNS_DIFFER, weightedLeftSum * weightedRightSum < 0.000 ? 1.0 : 0.0);
    features.put(FEATURES.PS_MAX_ABS_DIFF, absLogPMinMaxDiff);
    features.put(FEATURES.PS_LEFT_POS_NEG_RATIO, leftPosNegRatio);
    features.put(FEATURES.PS_RIGHT_POS_NEG_RATIO, rightPosNegRatio);
    // TODO: add surface-contribution-based metrics as well.
    return features;
  }

  /**
   * Compute features related to the logP-labeled molecular surface computed by MarvinSpace.
   *
   * <p>Steps, in the order they must happen:
   * <ol>
   *   <li>collect per-atom logP values (plus stand-in values for implicit hydrogens) for surface coloring;</li>
   *   <li>compute the plane splits <em>before</em> the molecule is handed to MarvinSpace;</li>
   *   <li>build the surface and attribute each surface vertex to its nearest atom;</li>
   *   <li>regress vertex-count-weighted logP against each atom's projection along `lv` (REG_* features);</li>
   *   <li>pick the best plane split and merge its PS_* features, if any split qualifies;</li>
   *   <li>configure the surface rendering (display itself is left to the owner of the JFrame).</li>
   * </ol>
   *
   * @param jFrame A jFrame to use when running MarvinSpace (seems strange but is required).
   * @param hydrogensShareNeighborsLogP Set to true if hydrogen atoms should share their neighbor's logP value.
   * @return A map of features related to and depending on the computed molecular surface.
   * @throws Exception
   */
  public Map<FEATURES, Double> computeSurfaceFeatures(JFrame jFrame, boolean hydrogensShareNeighborsLogP)
      throws Exception {
    // TODO: use the proper marvin sketch scene to get better rendering control instead of MSpaceEasy.
    MSpaceEasy mspace = new MSpaceEasy(1, 2, true);
    mspace.addCanvas(jFrame.getContentPane());
    mspace.setSize(1200, 600);

    ArrayList<Double> logPVals = new ArrayList<>();
    ArrayList<Double> hValues = new ArrayList<>();
    // Store a list of ids so we can label the atoms in the surface rendering (otherwise we won't know what's what).
    ArrayList<Integer> ids = new ArrayList<>();
    // Snapshot the atom array now; see the note further down about implicit hydrogens showing up later.
    MolAtom[] atoms = mol.getAtomArray();
    for (int i = 0; i < mol.getAtomCount(); i++) {
      ids.add(i);
      Double logP = plugin.getAtomlogPIncrement(i);
      logPVals.add(logP);

      /* The surface renderer requires that we specify logP values for all hydrogens, which don't appear to have logP
       * contributions calculated for them, in addition to non-hydrogen atoms.  We fake this by either borrowing the
       * hydrogen's neighbor's logP value, or setting it to 0.0.
       * TODO: figure out what the command-line marvin sketch logP renderer does and do that instead.
       * */
      MolAtom molAtom = mol.getAtom(i);
      for (int j = 0; j < molAtom.getImplicitHcount(); j++) {
        // Note: the logPPlugin's deprecated getAtomlogPHIncrement method just uses the non-H neighbor's logP, as here.
        // msketch seems to do something different, but it's unclear what that is.
        hValues.add(hydrogensShareNeighborsLogP ? logP : 0.0);
      }
    }
    /* Tack the hydrogen's logP contributions on to the list of proper logP values.  The MSC renderer seems to expect
     * the hydrogen's values after the non-hydrogen's values, so appending appears to work fine. */
    logPVals.addAll(hValues);

    // Compute the planes before rendering to avoid the addition of implicit hydrogens in the calculation.
    // TODO: re-strip hydrogens after rendering to avoid these weird issues in general.
    Map<Integer, Pair<List<Integer>, List<Integer>>> splitPlanes = splitAtomsByNormalPlanes();

    MoleculeComponent mc1 = mspace.addMoleculeTo(mol, 0);
    mspace.getEventHandler().createAtomLabels(mc1, ids);

    // Don't draw hydrogens; it makes the drawing too noisy.
    mspace.setProperty("MacroMolecule.Hydrogens", "false");
    MoleculeComponent mc2 = mspace.addMoleculeTo(mol, 1);
    MolecularSurfaceComponent msc = mspace.computeSurface(mc2);
    SurfaceComponent sc = msc.getSurface();

    // Note: if we call mol.getAtomArray() here, it will contain all the implicit hydrogens.
    // Count, per atom, how many surface vertices are closer to it than to any other atom.
    Map<Integer, Integer> surfaceComponentCounts = new HashMap<>();
    for (int i = 0; i < atoms.length; i++) {
      surfaceComponentCounts.put(i, 0);
    }
    for (int i = 0; i < sc.getVertexCount(); i++) {
      DPoint3 c = new DPoint3(sc.getVertexX(i), sc.getVertexY(i), sc.getVertexZ(i));
      Double closestDist = null;
      Integer closestAtom = null;
      for (int j = 0; j < atoms.length; j++) {
        double dist = c.distance(atoms[j].getLocation());
        if (closestDist == null || closestDist > dist) {
          closestDist = dist;
          closestAtom = j;
        }
      }
      surfaceComponentCounts.put(closestAtom, surfaceComponentCounts.get(closestAtom) + 1);
    }

    // Build a line of (proj(p, lv), logP) pairs.
    List<Pair<Double, Double>> weightedVals = new ArrayList<>();
    for (int i = 0; i < atoms.length; i++) {
      Integer count = surfaceComponentCounts.get(i);
      Double logP = plugin.getAtomlogPIncrement(i);
      Double x = distancesAlongLongestVector.get(i);
      Double y = count.doubleValue() * logP;
      // Ditch non-contributing atoms.  (A NaN product fails both comparisons and is dropped as well.)
      if (y < -0.001 || y > 0.001) {
        weightedVals.add(Pair.of(x, y));
      }
    }
    Collections.sort(weightedVals);

    Pair<Double, Double> slopeIntercept = performRegressionOverXYPairs(weightedVals);
    double valAtFarthestPoint =
        distancesAlongLongestVector.get(lvIndex2) * slopeIntercept.getLeft() + slopeIntercept.getRight();

    Map<FEATURES, Double> features = new HashMap<>();
    features.put(FEATURES.REG_WEIGHTED_SLOPE, slopeIntercept.getLeft());
    features.put(FEATURES.REG_WEIGHTED_INTERCEPT, slopeIntercept.getRight());
    features.put(FEATURES.REG_VAL_AT_FARTHEST_POINT, valAtFarthestPoint);
    /* Multiply the intercept with the value at the largest point to see if there's a sign change.  If so, we'll
     * get a negative number and know the regression line crosses the axis. */
    features.put(FEATURES.REG_CROSSES_X_AXIS, valAtFarthestPoint * slopeIntercept.getRight() < 0.000 ? 1.0 : 0.0);

    // Flatten the list of split planes and find the "best" one (i.e. the one that maximizes the weighted logP delta).
    List<AtomSplit> allSplitPlanes = new ArrayList<>();
    for (int i = 0; i < atoms.length; i++) {
      if (!splitPlanes.containsKey(i)) {
        continue;
      }
      Pair<List<Integer>, List<Integer>> splitAtoms = splitPlanes.get(i);
      List<Integer> leftAtoms = splitAtoms.getLeft();
      List<Integer> rightAtoms = splitAtoms.getRight();
      Pair<AtomSplit, AtomSplit> splitVariants = AtomSplit.computePlaneSplitsForIntersectingAtom(
          leftAtoms, rightAtoms, i, plugin, surfaceComponentCounts
      );

      AtomSplit l = splitVariants.getLeft();
      AtomSplit r = splitVariants.getRight();
      allSplitPlanes.add(l);
      allSplitPlanes.add(r);
    }
    Pair<AtomSplit, Map<FEATURES, Double>> bestPsRes = findBestPlaneSplitFeatures(allSplitPlanes);
    // No split may qualify (e.g. no planes at all, or every weighted delta is zero); in that case there are simply
    // no PS_* features to merge, and putAll(null) would throw.
    Map<FEATURES, Double> bestSplitFeatures = bestPsRes.getRight();
    if (bestSplitFeatures != null) {
      features.putAll(bestSplitFeatures);
    }

    msc.setPalette(SurfaceColoring.COLOR_MAPPER_BLUE_TO_RED);
    msc.showVolume(true);
    // These parameters were selected via experimentation.
    msc.setSurfacePrecision("High");
    msc.setSurfaceType("van der Waals");
    msc.setDrawProperty("Surface.DrawType", "Dot");
    msc.setDrawProperty("Surface.Quality", "High");
    msc.setAtomPropertyList(logPVals);
    msc.setDrawProperty("Surface.ColorType", "AtomProperty");

    // Don't display here--leave that to the owner of the JFrame.
    return features;
  }

  public static final double[] SOLUBILITY_PHS = new double[] {2.5, 3.0, 3.5};
  /**
   * Calculate whole-molecule features used in post-processing and filtering.
   *
   * <p>Produces pH-dependent solubility (one value per entry of {@link #SOLUBILITY_PHS}, in mg/ml), the HLB value,
   * and up to three acidic and three basic macro pKa values together with the indices reported alongside them.
   *
   * @return A map of whole-molecule features (SOL_*, PKA_* and HLB_VAL).
   * @throws Exception
   */
  public Map<FEATURES, Double> calculateAdditionalFilteringFeatures() throws Exception {
    SolubilityCalculator sc = new SolubilityCalculator();
    SolubilityResult[] solubility = sc.calculatePhDependentSolubility(mol, SOLUBILITY_PHS);

    HlbPlugin hlb = HlbPlugin.Builder.createNew();
    hlb.setMolecule(mol);
    hlb.run();
    double hlbVal = hlb.getHlbValue();

    pKaPlugin pka = new pKaPlugin();
    // From the documentation.  Not sure what these knobs do...
    pka.setBasicpKaLowerLimit(-5.0);
    pka.setAcidicpKaUpperLimit(25.0);
    pka.setpHLower(2.5); // for ms distr
    pka.setpHUpper(3.5); // for ms distr
    pka.setpHStep(0.5);  // for ms distr
    pka.setMolecule(mol);
    pka.run();

    double[] pkaAcidVals = new double[3];
    int[] pkaAcidIndices = new int[3];

    double[] pkaBasicVals = new double[3];
    int[] pkaBasicIndices = new int[3];

    // Also not sure these are the values we're interested in.
    pka.getMacropKaValues(pKaPlugin.ACIDIC, pkaAcidVals, pkaAcidIndices);
    pka.getMacropKaValues(pKaPlugin.BASIC, pkaBasicVals, pkaBasicIndices);

    // TODO: compute carbon chain length.
    // A plain HashMap is used on purpose: an anonymous "double brace" subclass would keep a hidden reference to
    // this analysis (and its molecule) alive for as long as the returned map exists.
    Map<FEATURES, Double> features = new HashMap<>();
    // Solubility results are in the same order as SOLUBILITY_PHS.
    features.put(FEATURES.SOL_MG_ML_25, solubility[0].getSolubility(SolubilityUnit.MGPERML));
    features.put(FEATURES.SOL_MG_ML_30, solubility[1].getSolubility(SolubilityUnit.MGPERML));
    features.put(FEATURES.SOL_MG_ML_35, solubility[2].getSolubility(SolubilityUnit.MGPERML));

    features.put(FEATURES.PKA_ACID_1, pkaAcidVals[0]);
    features.put(FEATURES.PKA_ACID_1_IDX, (double) pkaAcidIndices[0]);
    features.put(FEATURES.PKA_ACID_2, pkaAcidVals[1]);
    features.put(FEATURES.PKA_ACID_2_IDX, (double) pkaAcidIndices[1]);
    features.put(FEATURES.PKA_ACID_3, pkaAcidVals[2]);
    features.put(FEATURES.PKA_ACID_3_IDX, (double) pkaAcidIndices[2]);

    features.put(FEATURES.PKA_BASE_1, pkaBasicVals[0]);
    features.put(FEATURES.PKA_BASE_1_IDX, (double) pkaBasicIndices[0]);
    features.put(FEATURES.PKA_BASE_2, pkaBasicVals[1]);
    features.put(FEATURES.PKA_BASE_2_IDX, (double) pkaBasicIndices[1]);
    features.put(FEATURES.PKA_BASE_3, pkaBasicVals[2]);
    features.put(FEATURES.PKA_BASE_3_IDX, (double) pkaBasicIndices[2]);

    features.put(FEATURES.HLB_VAL, hlbVal);
    return features;
  }

  /**
   * @return The inchi string stored on this analysis object (presumably the one supplied to init -- confirm).
   */
  public String getInchi() {
    return this.inchi;
  }

  /**
   * @return The logP plugin held by this analysis object; the same object performAnalysis queries for true logP.
   */
  public logPPlugin getPlugin() {
    return this.plugin;
  }

  /**
   * @return The molecule this analysis object operates on (the same object used for the solubility, HLB, and pKa
   *         calculations in this class).
   */
  public Molecule getMol() {
    return this.mol;
  }

  /**
   * @return The atom-to-index map held by this analysis object.  The field itself is returned; no copy is made.
   */
  public Map<MolAtom, Integer> getAtomToIndexMap() {
    return this.atomToIndexMap;
  }

  /**
   * @return The first longest-vector (lv) index (presumably the atom index of one lv endpoint -- confirm).
   */
  public Integer getLvIndex1() {
    return this.lvIndex1;
  }

  /**
   * @return The second longest-vector (lv) index (presumably the atom index of the other lv endpoint -- confirm).
   */
  public Integer getLvIndex2() {
    return this.lvIndex2;
  }

  /**
   * @return The list of normalized coordinates held by this analysis object.  The field itself is returned; no
   *         copy is made.
   */
  public List<DPoint3> getNormalizedCoordinates() {
    return this.normalizedCoordinates;
  }

  /**
   * @return The major-microspecies plugin held by this analysis object.
   */
  public MajorMicrospeciesPlugin getMicrospeciesPlugin() {
    return this.microspeciesPlugin;
  }

  /**
   * @return The map of distances from the longest vector (keys are presumably atom indices -- confirm).  The
   *         field itself is returned; no copy is made.
   */
  public Map<Integer, Double> getDistancesFromLongestVector() {
    return this.distancesFromLongestVector;
  }

  /**
   * @return The map of distances along the longest vector (keys are presumably atom indices -- confirm).  The
   *         field itself is returned; no copy is made.
   */
  public Map<Integer, Double> getDistancesAlongLongestVector() {
    return this.distancesAlongLongestVector;
  }

  /**
   * @return The map of normal planes held by this analysis object (presumably the lv-normal plane per atom index
   *         -- confirm).  The field itself is returned; no copy is made.
   */
  public Map<Integer, Plane> getNormalPlanes() {
    return this.normalPlanes;
  }

  // TODO: add greedy high/low logP neighborhood picking, compute bounding balls, and calc intersection (spherical cap).
  // TODO: restructure this class to make the analysis steps more modular (now they're coupled to surface computation).
  /**
   * Perform all analysis for a molecule, returning a map of all available features.
   *
   * <p>The computed features are also printed to stdout, sorted by feature, for progress tracking.  A JFrame is
   * always created because the surface computation takes one as an argument.  It is packed and made visible only
   * when {@code display} is true; in every other case (including when an analysis step throws) it is disposed
   * before this method exits, so repeated non-interactive calls do not accumulate undisposed frames.
   *
   * @param inchi The molecule to analyze.
   * @param display True if the molecule should be displayed; set to false for non-interactive analysis.
   * @return A map of all features for this molecule.
   * @throws Exception if initialization or any analysis step fails.
   */
  public static Map<FEATURES, Double> performAnalysis(String inchi, boolean display) throws Exception {
    SurfactantAnalysis surfactantAnalysis = new SurfactantAnalysis();
    surfactantAnalysis.init(inchi);

    // Start with simple structural analyses.
    Pair<Integer, Integer> farthestAtoms = surfactantAnalysis.findFarthestContributingAtomPair();
    Double longestVectorLength = surfactantAnalysis.computeDistance(farthestAtoms.getLeft(), farthestAtoms.getRight());

    // Then compute the atom distances to the longest vector (lv) and produce lv-normal planes at each atom.
    Pair<Map<Integer, Double> , Map<Integer, Plane>> results =
        surfactantAnalysis.computeAtomDistanceToLongestVectorAndNormalPlanes();
    // Find the max distance so we can calculate the maxDist/|lv| ratio, or "skinny" factor.
    double maxDistToLongestVector = 0.0;
    for (Double distance : results.getLeft().values()) {
      maxDistToLongestVector = Math.max(maxDistToLongestVector, distance);
    }

    // A map of the molecule features we'll eventually output.
    Map<FEATURES, Double> features = new HashMap<>();

    // Explore the lv endpoint and min/max logP atom neighborhoods, and merge those features into the complete map.
    Map<FEATURES, Double> neighborhoodFeatures = surfactantAnalysis.exploreExtremeNeighborhoods();
    features.putAll(neighborhoodFeatures);

    /* Perform regression analysis on the projection of the molecules onto lv, where their y-axis is their logP value.
     * Higher |slope| may mean more extreme logP differences at the ends. */
    Double slope = surfactantAnalysis.performRegressionOverLVProjectionOfLogP();

    /* Compute the logP surface of the molecule (seems to require a JFrame?), and collect those features.  We consider
     * the number of closest surface components to each atom so we can guess at how much interior atoms actually
     * contribute to the molecule's solubility. */
    JFrame jFrame = new JFrame();
    // Tracks whether the frame was handed to the user; if not, it is disposed in the finally block below.
    boolean frameShown = false;
    try {
      jFrame.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE);
      Map<FEATURES, Double> surfaceFeatures = surfactantAnalysis.computeSurfaceFeatures(jFrame, true);
      features.putAll(surfaceFeatures);

      // Save absolute logP since we calculated it.
      features.put(FEATURES.LOGP_TRUE, surfactantAnalysis.plugin.getlogPTrue());
      features.put(FEATURES.GEO_LV_FD_RATIO, maxDistToLongestVector / longestVectorLength);
      features.put(FEATURES.REG_ABS_SLOPE, slope);

      Map<FEATURES, Double> additionalFeatures = surfactantAnalysis.calculateAdditionalFilteringFeatures();
      features.putAll(additionalFeatures);

      List<FEATURES> sortedFeatures = new ArrayList<>(features.keySet());
      Collections.sort(sortedFeatures);

      // Print these for easier progress tracking.
      System.out.format("features:\n");
      for (FEATURES f : sortedFeatures) {
        System.out.format("  %s = %f\n", f, features.get(f));
      }

      if (display) {
        jFrame.pack();
        jFrame.setVisible(true);
        frameShown = true;
      }

      return features;
    } finally {
      if (!frameShown) {
        // Nobody will ever see or close this frame, so release it here.  dispose() does not deliver a
        // window-closing event, so the EXIT_ON_CLOSE setting above is not triggered by this call.
        jFrame.dispose();
      }
    }
  }
}