SarTree.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.sarinference;

import chemaxon.clustering.LibraryMCS;
import chemaxon.formats.MolFormatException;
import chemaxon.struc.Molecule;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import java.util.function.Consumer;

/**
 * Provides a nice interface over the results of a LibMCS clustering run.
 */
public class SarTree {

  private static final Logger LOGGER = LogManager.getFormatterLogger(SarTree.class);
  private static transient final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

  private static final Integer FIRST_ID = 1;
  private static final Boolean ALL_NODES = false; // Tell LibMCS to return all nodes in the tree in its clustering
  private static final String HIERARCHY_ID = "HierarchyID";

  private Map<String, SarTreeNode> nodeMap;

  public SarTree() {
    nodeMap = new HashMap<>();
  }

  public Collection<SarTreeNode> getNodes() {
    return nodeMap.values();
  }

  public void addNode(SarTreeNode node) {
    nodeMap.put(node.getHierarchyId(), node);
  }

  /**
   * Builds this SAR tree by running LibraryMCS clustering on the given set of molecules.
   *
   * @param libMcs The clusterer.
   * @param molecules The molecules.
   * @throws InterruptedException
   */
  public void buildByClustering(LibraryMCS libMcs, Collection<Molecule> molecules) throws InterruptedException {
    if (molecules.isEmpty()) {
      LOGGER.error("Tried to build clustering on no molecules! Aborting call");
      // Log error but don't throw error. This could be a piece in a workflow where we try to build SARs on a prediction
      // corpus with no valid predictions. In that case we should not kill the workflow, but let it run and produce no
      // results.
      return;
    }

    for (Molecule mol : molecules) {
      libMcs.addMolecule(mol);
    }
    LOGGER.info("Beginning libMCS clustering.");
    libMcs.search();

    LibraryMCS.ClusterEnumerator enumerator = libMcs.getClusterEnumerator(ALL_NODES);
    this.buildFromEnumerator(enumerator);
  }

  /**
   * Builds this SAR tree based on the results of a LibMCS clustering.
   *
   * @param enumerator An enumerator for the LibMCS clusters.
   */
  private void buildFromEnumerator(LibraryMCS.ClusterEnumerator enumerator) {

    Molecule molecule;

    // Sometimes, even when enumerator.hasNext() returns true, enumerator.next() returns, not a null value, but
    // a NullPointerException. This might merit further investigation as it is totally bizarre.
    while (enumerator.hasNext()) {
      try {
        molecule = enumerator.next();
      } catch (NullPointerException e) {
        LOGGER.error("Null pointer exception thrown internally by enumerator.next() : %s", e.getMessage());
        // Log but don't throw error. This doesn't seem to indicate a real problem, but just a quirk of LibMCS.
        return;
      }

      try {
        this.addNode(getSarTreeNodeFromEnumeratorOutput(molecule));
      } catch (MolFormatException e) {
        // This error only seems to happen when the enumerator has in fact run out of valid molecules.
        // For that reason, we return instead of continuing or throwing an exception.
        LOGGER.info("Exception on getting SarTreeNode, stopping SarTree build: %s", e.getMessage());
        return;
      }
    }
  }

  /**
   * Builds a SarTreeNode from the corresponding molecule output from the enumerator. Reads the HierarchyID as well as
   * the prediction ID property from the molecule, if it exists.
   *
   * @param molecule A molecule spit out by the ClusterEnumerator.
   * @return A SarTreeNode.
   */
  public SarTreeNode getSarTreeNodeFromEnumeratorOutput(Molecule molecule) throws MolFormatException {
    if (molecule.getPropertyObject(HIERARCHY_ID) == null) {
      throw new MolFormatException("Bizarre error: Libmcs returned us a molecule with no hierarchy ID.");
    }

    String hierId = molecule.getPropertyObject(HIERARCHY_ID).toString();
    List<Integer> predictionIds = new ArrayList<>();

    // A molecule has a RecoveryID iff it's a leaf node, by LibraryMCS's API. Return with no prediction IDs,
    // as a higher level SAR node does not correspond to any particular predictions.
    if (molecule.getPropertyObject("RecoveryID") == null) {
      return new SarTreeNode(molecule, hierId, predictionIds);
    }

    // If it is a leaf node, it should always have the PREDICTION_ID_KEY property object- we added it earlier!
    // Thus we should add those prediction IDs to the SarTreeNode.
    Object propertyObject = molecule.getPropertyObject(SarTreeNode.PREDICTION_ID_KEY);
    if (propertyObject != null) {
      try {
        predictionIds = Arrays.asList(OBJECT_MAPPER.readValue(propertyObject.toString(), Integer[].class));
      } catch (IOException e) {
        LOGGER.info("Couldn't deserialize %s into list : %s", propertyObject, e.getMessage());
        throw new RuntimeException(e);
      }
    }

    return new SarTreeNode(molecule, hierId, predictionIds);
  }

  public List<SarTreeNode> getRootNodes() {
    SarTreeNode node;
    int clusterNumber = FIRST_ID;

    List<SarTreeNode> result = new ArrayList<>();
    while ((node = nodeMap.get(Integer.toString(clusterNumber))) != null) {
      result.add(node);
      clusterNumber++;
    }

    return result;
  }

  public List<SarTreeNode> getChildren(SarTreeNode node) {
    int childNumber = FIRST_ID;
    SarTreeNode child;
    List<SarTreeNode> result = new ArrayList<>();

    while ((child = nodeMap.get(getChildHierarchyId(node, childNumber))) != null) {
      result.add(child);
      childNumber++;
    }

    return result;
  }

  public void applyToNodes(Consumer<SarTreeNode> consumer, Integer minSubtreeSize) throws IOException {
    Collection<SarTreeNode> nodesToProcess = getNodesAboveThresholdDescendants(minSubtreeSize);
    LOGGER.info("%d sars to score.", nodesToProcess.size());
    nodesToProcess.forEach(consumer);
  }

  private List<SarTreeNode> getNodesAboveThresholdDescendants(Integer minSubtreeSize) {
    List<SarTreeNode> result = new ArrayList<>();

    for (SarTreeNode node : getNodes()) {
      if (getSubtreeSize(node) >= minSubtreeSize) {
        result.add(node);
      }
    }

    return result;
  }

  /**
   * Pulls the SARs we care about out of a SarTree. These are chosen to be above a certain confidence level, and
   * explain at least a certain number of nodes. Additionally no SAR with less than one direct child is chosen:
   * no direct child indicates a leaf node, while one direct child means we can throw this node out and use its
   * child instead of it, to avoid duplicating SARs along a non-branching path.
   *
   * @param subtreeThreshold The min size of the subtree explained by a relevant SAR.
   * @param thresholdConfidence The min confidence score for a relevant SAR.
   * @return A list of SarTreeNodes
   */
  public SarTreeNodeList getExplanatoryNodes(int subtreeThreshold, double thresholdConfidence) {
    SarTreeNodeList results = new SarTreeNodeList();

    for (SarTreeNode node : getNodes()) {
      if (getChildren(node).size() > 1) {
        if (node.getPercentageHits() > thresholdConfidence && getSubtreeSize(node) >= subtreeThreshold) {
          results.addNode(node);
        }
      }
    }

    return results;
  }

  public Integer getSubtreeSize(SarTreeNode node) {
    return traverseSubtree(node).size();
  }

  /**
   * Depth first traversal of the subtree with a given root.
   *
   * @param subtreeRoot The root.
   * @return The list of SarTreeNodes in the given subtree.
   */
  public List<SarTreeNode> traverseSubtree(SarTreeNode subtreeRoot) {
    Stack<SarTreeNode> nodeStack = new Stack<>();
    return traverseSubtree(subtreeRoot, nodeStack);
  }

  /**
   * Utility function to implement a depth first traversal.
   * test().
   *
   * @param subtreeRoot The root of the subtree to search.
   * @param stack A stack to use to facilitate the search.
   * @return
   */
  private List<SarTreeNode> traverseSubtree(SarTreeNode subtreeRoot,
                                            Stack<SarTreeNode> stack) {
    List<SarTreeNode> nodes = new ArrayList<>();
    Integer initialSize = stack.size();
    stack.push(subtreeRoot);

    while (stack.size() > initialSize) {
      SarTreeNode nextNode = stack.pop();
      nodes.add(nextNode);
      for (SarTreeNode childNode : getChildren(nextNode)) {
        nodes.addAll(traverseSubtree(childNode, stack));
      }
    }
    return nodes;
  }

  /**
   * Gets the hierarchyId of the child of node with a particular index.
   *
   * @param node The parent node.
   * @param index A child index. Children are numbered from 1.
   * @return The child's hierarchy ID.
   */
  private String getChildHierarchyId(SarTreeNode node, int index) {
    if (index == 0) {
      throw new IllegalArgumentException("HierarchyIDs only use positive integer indices.");
    }
    return new StringBuilder(node.getHierarchyId())
        .append(".")
        .append(Integer.toString(index))
        .toString();
  }
}