/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program. If not, see <http://www.gnu.org/licenses/>.  *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.sarinference;

import chemaxon.clustering.LibraryMCS;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.struc.Molecule;
import com.act.biointerpretation.l2expansion.L2FilteringDriver;
import com.act.biointerpretation.l2expansion.L2Prediction;
import com.act.biointerpretation.l2expansion.L2PredictionCorpus;
import com.act.jobs.FileChecker;
import com.act.jobs.JavaRunnable;
import com.act.lcms.db.io.report.IonAnalysisInterchangeModel;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

public class LibMcsClustering {

  private static final Logger LOGGER = LogManager.getFormatterLogger(LibMcsClustering.class);

  private static final String OPTION_PREDICTION_CORPUS = "c";
  private static final String OPTION_POSITIVE_INCHIS = "p";
  private static final String OPTION_SARTREE_PATH = "t";
  private static final String OPTION_SCORED_SAR_PATH = "s";
  private static final String OPTION_HELP = "h";

  public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {
    {
      add(Option.builder(OPTION_PREDICTION_CORPUS)
          .argName("input corpus path")
          .desc("The absolute path to the input prediction corpus.")
          .hasArg()
          .longOpt("input-corpus-path")
          .required()
      );
      add(Option.builder(OPTION_POSITIVE_INCHIS)
          .argName("positive inchis file")
          .desc("The path to a file of positive inchis from LCMS analysis of the prediction corpus.")
          .hasArg()
          .longOpt("input-positive-inchis")
          .required()
      );
      add(Option.builder(OPTION_SARTREE_PATH)
          .argName("sartree path")
          .desc("The path to which to write the intermediate file produced by structure clustering.")
          .hasArg()
          .longOpt("sartree-path")
          .required()
      );
      add(Option.builder(OPTION_SCORED_SAR_PATH)
          .argName("scored sar path")
          .desc("The path to which to write the final output file of scored sars.")
          .hasArg()
          .longOpt("scored-sar-path")
          .required()
      );
      add(Option.builder(OPTION_HELP)
          .argName("help")
          .desc("Prints this help message.")
          .longOpt("help")
      );
    }
  };

  public static final String HELP_MESSAGE =
      "This class is used to build SARs from an L2Prediction run and LCMS analysis results. The inputs are an " +
          "L2PredictionCorpus and a file with all the product inchis that came up as positive in the LCMS analysis. " +
          "The output is a list of SARs with percentageHits scores based on how predictive they are of the " +
          "reactions in the corpus.";

  public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

  static {
    HELP_FORMATTER.setWidth(100);
  }

  // This scoring function heuristically balances two concerns: SARs with a high hit percentage should rank
  // highly, but SARs with only a couple of total matches should not.
  private static final SarTreeNode.ScoringFunctions SAR_SCORING_FUNCTION = SarTreeNode.ScoringFunctions.HIT_MINUS_MISS;

  private static final Integer THRESHOLD_TREE_SIZE = 2; // Any SAR that is not simply one specific substrate is allowed.

  public static void main(String[] args) throws Exception {

    // Build command line parser.
    Options opts = new Options();
    for (Option.Builder b : OPTION_BUILDERS) {
      opts.addOption(b.build());
    }

    CommandLine cl = null;
    try {
      CommandLineParser parser = new DefaultParser();
      cl = parser.parse(opts, args);
    } catch (ParseException e) {
      LOGGER.error("Argument parsing failed: %s", e.getMessage());
      exitWithHelp(opts);
    }

    // Print help.
    if (cl.hasOption(OPTION_HELP)) {
      HELP_FORMATTER.printHelp(LibMcsClustering.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      return;
    }

    File inputCorpusFile = new File(cl.getOptionValue(OPTION_PREDICTION_CORPUS));
    File positiveInchisFile = new File(cl.getOptionValue(OPTION_POSITIVE_INCHIS));
    File sartreeFile = new File(cl.getOptionValue(OPTION_SARTREE_PATH));
    File scoredSarFile = new File(cl.getOptionValue(OPTION_SCORED_SAR_PATH));

    JavaRunnable clusterer = getClusterer(inputCorpusFile, sartreeFile);
    JavaRunnable scorer = getSarScorer(
        inputCorpusFile,
        sartreeFile,
        positiveInchisFile,
        scoredSarFile,
        SAR_SCORING_FUNCTION,
        THRESHOLD_TREE_SIZE);

    LOGGER.info("Running clustering.");
    clusterer.run();

    LOGGER.info("Running SAR scoring.");
    scorer.run();

    LOGGER.info("Complete!");
  }

  private static void exitWithHelp(Options opts) {
    HELP_FORMATTER.printHelp(LibMcsClustering.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
    System.exit(1);
  }

  /**
   * Reads in a prediction corpus, containing only one RO's predictions, and builds a clustering tree of the
   * substrates. Returns every SarTreeNode in the tree.
   * This method is static because it does not rely on any properties of the enclosing class to construct the job.
   * TODO: It would probably make more sense to make this its own class, i.e. <Clusterer implements JavaRunnable>
   *
   * @param predictionCorpusInput The prediction corpus input file.
   * @param sarTreeNodesOutput The file to which to write the SarTreeNodeList of every node in the clustering tree.
   * @return A JavaRunnable to run the appropriate clustering.
   */
  public static JavaRunnable getClusterer(File predictionCorpusInput, File sarTreeNodesOutput) {
    return new JavaRunnable() {

      @Override
      public void run() throws IOException {
        // Verify input and output files
        FileChecker.verifyInputFile(predictionCorpusInput);
        FileChecker.verifyAndCreateOutputFile(sarTreeNodesOutput);

        // Build input corpus and substrate list
        L2PredictionCorpus inputCorpus = L2PredictionCorpus.readPredictionsFromJsonFile(predictionCorpusInput);

        // Get list of molecules, tagged by PredictionId so we can track back to the corresponding predictions later on.
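        // Each Molecule carries its originating prediction ids as a List<Integer> stored under
        // SarTreeNode.PREDICTION_ID_KEY (see importMoleculesWithPredictionIds below), so the ids survive even when
        // LibraryMCS later normalizes the structures and changes their inchis.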
        Collection<Molecule> molecules = importMoleculesWithPredictionIds(inputCorpus);

        // Run substrate clustering
        SarTree sarTree = new SarTree();
        try {
          sarTree.buildByClustering(new LibraryMCS(), molecules);
        } catch (InterruptedException e) {
          LOGGER.error("Threw interrupted exception during buildByClustering: %s", e.getMessage());
          throw new RuntimeException(e);
        }

        // Write output to file
        SarTreeNodeList nodeList = new SarTreeNodeList(new ArrayList<>(sarTree.getNodes()));
        nodeList.writeToFile(sarTreeNodesOutput);
      }

      @Override
      public String toString() {
        return "SarClusterer:" + predictionCorpusInput.getName();
      }

      /**
       * Imports the substrate molecules from the prediction corpus, and tags them with their prediction ids from
       * the corpus, using Chemaxon's setPropertyObject. These prediction ids can then be recovered from the molecules
       * later, and mapped back to the original inchis, even if the molecule has been changed so that its inchi is no
       * longer the same. In particular, LibraryMCS strips stereo and other layers from the inchis, which would make
       * it hard to track LCMS hits and misses between the output molecules of LibraryMCS and the initial inchis fed
       * into LibraryMCS.
       *
       * @param inputCorpus The prediction corpus.
       * @return A list of molecules, each tagged with a property object which contains the corresponding list of
       * prediction ids.
       * @throws MolFormatException If a molecule cannot be imported from an inchi.
       */
      private Collection<Molecule> importMoleculesWithPredictionIds(L2PredictionCorpus inputCorpus)
          throws MolFormatException {
        Map<String, Molecule> inchiToMoleculeMap = new HashMap<>();

        for (L2Prediction prediction : inputCorpus.getCorpus()) {
          for (String substrateInchi : prediction.getSubstrateInchis()) { // For now this should only be one substrate
            if (!inchiToMoleculeMap.containsKey(substrateInchi)) {
              Molecule mol = importMoleculeWithPredictionId(substrateInchi, prediction.getId());
              inchiToMoleculeMap.put(substrateInchi, mol);
            } else {
              List<Integer> predictionIds = (ArrayList<Integer>) inchiToMoleculeMap
                  .get(substrateInchi)
                  .getPropertyObject(SarTreeNode.PREDICTION_ID_KEY);
              predictionIds.add(prediction.getId());
            }
          }
        }

        return inchiToMoleculeMap.values();
      }

      private Molecule importMoleculeWithPredictionId(String inchi, Integer id) throws MolFormatException {
        Molecule mol = MolImporter.importMol(inchi, "inchi");
        List<Integer> predictionIds = new ArrayList<>();
        predictionIds.add(id);
        mol.setPropertyObject(SarTreeNode.PREDICTION_ID_KEY, predictionIds);
        return mol;
      }
    };
  }

  /**
   * Reads in an already-built SarTree from a SarTreeNodeList, and scores the SARs based on LCMS results loaded from
   * an IonAnalysisInterchangeModel file.
   * This method is static because it does not rely on any properties of the enclosing class to construct the job.
   * TODO: It would probably make more sense to make this its own class, i.e. <SarScorer implements JavaRunnable>
   *
   * @param predictionsFile The prediction corpus file; used, together with the LCMS results, to score the SARs.
   * @param sarTreeInput An input file containing a SarTreeNodeList with all SARs from the clustering tree.
   * @param lcmsInput File with LCMS hits.
   * @param sarTreeNodeOutput The output file to which to write the relevant SARs from the corpus, sorted in decreasing
   * order of confidence.
   * @param scoringFunction The function used to score and rank the SARs.
   * @param subtreeThreshold The minimum number of leaves a SAR should match to be returned.
   * @return A JavaRunnable to run the SAR scoring.
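   *
   * A minimal usage sketch (the file names here are illustrative, not part of the API):
   * <pre>
   *   JavaRunnable scorer = LibMcsClustering.getSarScorer(
   *       new File("predictions.json"),     // L2PredictionCorpus the SarTree was built from
   *       new File("sartree_nodes.json"),   // SarTreeNodeList written by getClusterer
   *       new File("lcms_results.json"),    // IonAnalysisInterchangeModel of LCMS hits
   *       new File("scored_sars.json"),     // output: scored SARs, sorted by the scoring function
   *       SarTreeNode.ScoringFunctions.HIT_MINUS_MISS,
   *       2);
   *   scorer.run();
   * </pre>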
   */
  public static JavaRunnable getSarScorer(
      File predictionsFile,
      File sarTreeInput,
      File lcmsInput,
      File sarTreeNodeOutput,
      SarTreeNode.ScoringFunctions scoringFunction,
      Integer subtreeThreshold) {

    return new JavaRunnable() {

      @Override
      public void run() throws IOException {
        // Verify input and output files
        FileChecker.verifyInputFile(predictionsFile);
        FileChecker.verifyInputFile(lcmsInput);
        FileChecker.verifyInputFile(sarTreeInput);
        FileChecker.verifyAndCreateOutputFile(sarTreeNodeOutput);

        L2PredictionCorpus predictionCorpus = L2PredictionCorpus.readPredictionsFromJsonFile(predictionsFile);

        // Build SAR tree from input file
        SarTreeNodeList nodeList = new SarTreeNodeList();
        nodeList.loadFromFile(sarTreeInput);
        SarTree sarTree = new SarTree();
        nodeList.getSarTreeNodes().forEach(node -> sarTree.addNode(node));

        // Build LCMS results
        IonAnalysisInterchangeModel lcmsResults = new IonAnalysisInterchangeModel();
        lcmsResults.loadResultsFromFile(lcmsInput);

        // Build sar scorer from LCMS and prediction corpus
        SarTreeBasedCalculator sarScorer = new SarTreeBasedCalculator(sarTree, predictionCorpus, lcmsResults);

        // Score SARs
        // Calculate hits and misses
        sarTree.applyToNodes(sarScorer, subtreeThreshold);
        // Retain nodes that are not repeats or leaves, and have more than 0 LCMS hits
        SarTreeNodeList treeNodeList = sarTree.getExplanatoryNodes(subtreeThreshold, 0.0);
        // Sort by the supplied scoring function.
        treeNodeList.sortBy(scoringFunction);

        // Write out output.
        treeNodeList.writeToFile(sarTreeNodeOutput);
      }

      @Override
      public String toString() {
        return "SarScorer:" + sarTreeInput.getName();
      }
    };
  }
}
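
// Illustrative end-to-end invocation (a sketch only; the jar name and file paths below are hypothetical, while the
// long option names match OPTION_BUILDERS above):
//
//   java -cp act.jar com.act.biointerpretation.sarinference.LibMcsClustering \
//       --input-corpus-path predictions.json \
//       --input-positive-inchis positive_inchis.json \
//       --sartree-path sartree_nodes.json \
//       --scored-sar-path scored_sars.json
//
// This runs getClusterer on the prediction corpus and then getSarScorer on the resulting SarTreeNodeList, writing
// the scored SARs to the --scored-sar-path file.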