/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program. If not, see <http://www.gnu.org/licenses/>.  *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.l2expansion;

import act.server.MongoDB;
import act.shared.Chemical;
import chemaxon.struc.Molecule;
import com.act.biointerpretation.Utils.ReactionProjector;
import com.act.biointerpretation.mechanisminspection.ErosCorpus;
import com.act.biointerpretation.sars.SarCorpus;
import com.act.jobs.FileChecker;
import com.act.jobs.JavaRunnable;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

/**
 * Runs L2 expansion: applies ROs (and optionally SARs) to a list of substrate inchis to predict new reactions.
 */
public class L2ExpansionDriver {

  private static final Logger LOGGER = LogManager.getFormatterLogger(L2ExpansionDriver.class);

  private static final Integer NO_MASS_THRESHOLD = Integer.MAX_VALUE;

  private static final String OPTION_METABOLITES = "m";
  private static final String OPTION_MASS_THRESHOLD = "M";
  private static final String OPTION_RO_CORPUS = "c";
  private static final String OPTION_RO_IDS = "r";
  private static final String OPTION_SAR_CORPUS = "s";
  private static final String OPTION_OUTPUT_PATH = "o";
  private static final String OPTION_PROGRESS_PATH = "p";
  private static final String OPTION_DB = "db";
  private static final String OPTION_EXPANSION_TYPE = "t";
  private static final String OPTION_ADDITIONAL_CHEMICALS = "a";
  private static final String OPTION_HELP = "h";

  public static final String HELP_MESSAGE =
      "This class is used to carry out L2 expansion. It first applies every RO from the input RO list to " +
          "every metabolite in the input metabolite list. Example input lists can be found on the NAS at " +
          "MNT_SHARED_DATA/Gil/resources. This creates a list of predicted reactions, which are augmented " +
          "with chemical ids and names, as well as reaction ids from the database. At the end of the run, " +
          "the predictions are printed to a json file.";
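
  // Illustrative invocation only: the jar name and file locations below are placeholders, not values
  // prescribed by this class. Required options are -m (metabolites), -o (output path), and -t (expansion type).
  //
  //   java -cp /path/to/act-assembly.jar com.act.biointerpretation.l2expansion.L2ExpansionDriver \
  //       -m /path/to/metabolite_inchis.txt \
  //       -r /path/to/ro_ids.txt \
  //       -t ONE_SUB \
  //       -o /path/to/predictions.json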
  public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{
    add(Option.builder(OPTION_METABOLITES)
        .argName("metabolites path name")
        .desc("The absolute path to the metabolites file.")
        .hasArg()
        .longOpt("metabolite-file")
        .required(true)
    );
    add(Option.builder(OPTION_MASS_THRESHOLD)
        .argName("mass threshold")
        .desc("The maximum mass of a substrate, in daltons. Substrates with higher mass will be discarded.")
        .hasArg()
        .longOpt("mass-threshold")
        .type(Integer.class)
    );
    add(Option.builder(OPTION_RO_CORPUS)
        .argName("ro corpus")
        .desc("The path to the file containing the eros corpus, if not the validation corpus. Ignored if " +
            "running a SAR expansion.")
        .hasArg()
        .longOpt("ro-corpus")
    );
    add(Option.builder(OPTION_RO_IDS)
        .argName("ro ids path name")
        .desc("The path to a file containing the RO ids to use. If this option is omitted, " +
            "all ROs in the corpus are used. Ignored if running a SAR expansion.")
        .hasArg()
        .longOpt("ro-file")
    );
    add(Option.builder(OPTION_SAR_CORPUS)
        .argName("sar corpus")
        .desc("The path to a file containing the sar corpus to use. Ignored if running an RO-only expansion.")
        .hasArg()
        .longOpt("sar-corpus")
    );
    add(Option.builder(OPTION_OUTPUT_PATH)
        .argName("output file path")
        .desc("The path to the file to which to write the json file of predicted reactions.")
        .hasArg()
        .longOpt("output-file-path")
        .required(true)
    );
    add(Option.builder(OPTION_PROGRESS_PATH)
        .argName("progress file path")
        .desc("The path to the file to which to write the json file of predicted reactions as each projection runs.")
        .hasArg()
        .longOpt("progress-file-path")
    );
    add(Option.builder(OPTION_DB)
        .argName("db name")
        .desc("The name of the mongo DB to use.")
        .hasArg()
        .longOpt("db-name")
    );
    add(Option.builder(OPTION_EXPANSION_TYPE)
        .argName("type of expansion")
        .desc("Type can take values: {ONE_SUB, TWO_SUB, SAR}. ONE_SUB and TWO_SUB run RO-only expansions on " +
            "one and two substrates, respectively. SAR runs an expansion from a SarCorpus, which still applies " +
            "ROs but additionally constrains the substrates of each RO based on the supplied SARs.")
        .hasArg()
        .longOpt("expansion-type")
        .required(true)
    );
    add(Option.builder(OPTION_ADDITIONAL_CHEMICALS)
        .argName("additional chemicals path name")
        .desc("The absolute path to the additional chemicals file.")
        .hasArg()
        .longOpt("additional-chemicals-file")
    );
    add(Option.builder(OPTION_HELP)
        .argName("help")
        .desc("Prints this help message.")
        .longOpt("help")
    );
  }};

  public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

  static {
    HELP_FORMATTER.setWidth(100);
  }

  public static final String LOCAL_HOST = "localhost";
  public static final Integer PORT_NUMBER = 27017;

  public enum ExpansionType {
    ONE_SUB,
    TWO_SUB,
    SAR,
  }
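
  // Inputs required per expansion type, as consumed by buildExpander below:
  //   ONE_SUB: the metabolite file, plus an optional RO corpus and/or RO id file.
  //   TWO_SUB: additionally requires --additional-chemicals-file and a MongoDB instance reachable at
  //            localhost:27017, selected by --db-name. This mode is still experimental.
  //   SAR:     requires --sar-corpus; the RO corpus and RO id options are ignored.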
  public static void main(String[] args) throws Exception {

    // Build command line parser.
    Options opts = new Options();
    for (Option.Builder b : OPTION_BUILDERS) {
      opts.addOption(b.build());
    }

    CommandLine cl = null;
    try {
      CommandLineParser parser = new DefaultParser();
      cl = parser.parse(opts, args);
    } catch (ParseException e) {
      LOGGER.error("Argument parsing failed: %s", e.getMessage());
      HELP_FORMATTER.printHelp(L2ExpansionDriver.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      System.exit(1);
    }

    // Print help.
    if (cl.hasOption(OPTION_HELP)) {
      HELP_FORMATTER.printHelp(L2ExpansionDriver.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      return;
    }

    // Get output files.
    String outputPath = cl.getOptionValue(OPTION_OUTPUT_PATH);
    File outputFile = new File(outputPath);
    if (outputFile.isDirectory() || outputFile.exists()) {
      LOGGER.error("Supplied output file is a directory or already exists.");
      System.exit(1);
    }
    outputFile.createNewFile();

    File inchiOutputFile = new File(outputPath + ".inchis");
    if (inchiOutputFile.isDirectory() || inchiOutputFile.exists()) {
      LOGGER.error("Supplied inchi output file is a directory or already exists.");
      System.exit(1);
    }
    inchiOutputFile.createNewFile();

    Optional<OutputStream> maybeProgressStream = Optional.empty();
    if (cl.hasOption(OPTION_PROGRESS_PATH)) {
      String progressPath = cl.getOptionValue(OPTION_PROGRESS_PATH);
      File progressFile = new File(progressPath);
      LOGGER.info("Writing incremental results to file at %s", progressFile.getAbsolutePath());
      if (progressFile.isDirectory() || progressFile.exists()) {
        LOGGER.error("Supplied progress file is a directory or already exists.");
        System.exit(1);
      }
      maybeProgressStream = Optional.of(new FileOutputStream(progressFile));
    }

    // Get metabolite list.
    L2InchiCorpus inchiCorpus = getInchiCorpus(cl, OPTION_METABOLITES);
    LOGGER.info("%d substrate inchis.", inchiCorpus.getInchiList().size());

    Integer maxMass = NO_MASS_THRESHOLD;
    if (cl.hasOption(OPTION_MASS_THRESHOLD)) {
      maxMass = Integer.parseInt(cl.getOptionValue(OPTION_MASS_THRESHOLD));
      LOGGER.info("Filtering out substrates with mass more than %d daltons.", maxMass);
    }
    inchiCorpus.filterByMass(maxMass);
    LOGGER.info("%d substrate inchis that are importable as molecules.", inchiCorpus.getInchiList().size());

    PredictionGenerator generator = new AllPredictionsGenerator(new ReactionProjector());

    L2Expander expander = buildExpander(cl, inchiCorpus, generator);
    L2PredictionCorpus predictionCorpus = expander.getPredictions(maybeProgressStream);
    LOGGER.info("Done with L2 expansion. Produced %d predictions.", predictionCorpus.getCorpus().size());

    LOGGER.info("Writing corpus to file.");
    predictionCorpus.writePredictionsToJsonFile(outputFile);

    L2InchiCorpus productInchis = new L2InchiCorpus(predictionCorpus.getUniqueProductInchis());
    productInchis.writeToFile(inchiOutputFile);

    LOGGER.info("L2ExpansionDriver complete!");
  }
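
  /**
   * Constructs the L2Expander requested by the expansion-type option, loading any supporting inputs
   * (RO corpus, RO id list, SAR corpus, additional chemicals, Mongo DB connection) from the command line.
   *
   * @param cl The parsed command line.
   * @param inchiCorpus The substrate inchis to expand on.
   * @param generator The prediction generator used to run the projections.
   * @return The configured expander.
   * @throws IOException If a supporting input file cannot be read.
   */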
  private static L2Expander buildExpander(CommandLine cl,
                                          L2InchiCorpus inchiCorpus,
                                          PredictionGenerator generator) throws IOException {
    ExpansionType expansionType = ExpansionType.valueOf(cl.getOptionValue(OPTION_EXPANSION_TYPE));

    switch (expansionType) {
      case ONE_SUB:
        LOGGER.info("Running one substrate expansion.");
        return new SingleSubstrateRoExpander(getRoCorpus(cl), inchiCorpus.getMolecules(), generator);

      case TWO_SUB:
        LOGGER.info("Running two substrate expansion.");
        LOGGER.warn("This functionality is still experimental as it is not currently tested.");
        if (!cl.hasOption(OPTION_ADDITIONAL_CHEMICALS)) {
          LOGGER.error("Must supply additional chemicals file for two substrate expansion.");
          System.exit(1);
        }
        // Connect to the local mongo instance.
        MongoDB mongoDB = new MongoDB(LOCAL_HOST, PORT_NUMBER, cl.getOptionValue(OPTION_DB));
        L2InchiCorpus chemicalInchis = getInchiCorpus(cl, OPTION_ADDITIONAL_CHEMICALS);
        List<Chemical> chemicalsOfInterest =
            L2ExpansionDriver.convertListOfInchisToMolecules(chemicalInchis.getInchiList(), mongoDB);
        List<Chemical> metaboliteChemicals =
            L2ExpansionDriver.convertListOfInchisToMolecules(inchiCorpus.getInchiList(), mongoDB);
        return new TwoSubstrateRoExpander(chemicalsOfInterest, metaboliteChemicals, getRoCorpus(cl), generator);

      case SAR:
        LOGGER.info("Running sar-based expansion.");
        File sarCorpusFile = new File(cl.getOptionValue(OPTION_SAR_CORPUS));
        if (!sarCorpusFile.exists() || sarCorpusFile.isDirectory()) {
          LOGGER.error("Sar corpus is not a valid file.");
          System.exit(1);
        }
        SarCorpus sarCorpus = SarCorpus.readCorpusFromJsonFile(sarCorpusFile);
        return new SingleSubstrateSarExpander(sarCorpus, inchiCorpus.getMolecules(), generator);

      default:
        throw new IllegalArgumentException("Invalid expansion type.");
    }
  }

  /**
   * Loads the RO corpus to use for the expansion. Uses the corpus file supplied on the command line if present,
   * and the validation corpus otherwise; the result is then filtered down to the supplied RO ids, if any.
   *
   * @param cl The parsed command line.
   * @return The loaded, possibly filtered, RO corpus.
   * @throws IOException If the corpus file cannot be read.
   */
  private static ErosCorpus getRoCorpus(CommandLine cl) throws IOException {
    ErosCorpus eroCorpus = new ErosCorpus();

    if (cl.hasOption(OPTION_RO_CORPUS)) {
      File roCorpusFile = new File(cl.getOptionValue(OPTION_RO_CORPUS));
      if (!roCorpusFile.exists()) {
        LOGGER.error("Ro corpus file does not exist.");
        System.exit(1);
      }
      FileInputStream roInputStream = new FileInputStream(roCorpusFile);
      eroCorpus.loadCorpus(roInputStream);
    } else {
      eroCorpus.loadValidationCorpus();
    }

    if (cl.hasOption(OPTION_RO_IDS)) {
      LOGGER.info("Filtering corpus by the supplied RO id file.");
      File roIdsFile = new File(cl.getOptionValue(OPTION_RO_IDS));
      if (!roIdsFile.exists()) {
        LOGGER.error("Ro ids file does not exist.");
        System.exit(1);
      }
      eroCorpus.filterCorpusByIdFile(roIdsFile);
    } else {
      LOGGER.info("Leaving all ROs in corpus.");
    }

    return eroCorpus;
  }

  /**
   * Gets a list of inchis for a command line option that points to a file with one inchi per line.
   *
   * @param cl The parsed command line.
   * @param optionForFileName Option for a file with one inchi per line; either the metabolite list or the
   *                          additional chemicals list.
   * @return The corpus of inchis contained in the file.
   * @throws IOException If the file cannot be read.
   */
  private static L2InchiCorpus getInchiCorpus(CommandLine cl, String optionForFileName) throws IOException {
    File inchisFile = new File(cl.getOptionValue(optionForFileName));
    LOGGER.info("Getting inchi list from %s", inchisFile);
    L2InchiCorpus inchiCorpus = new L2InchiCorpus();
    inchiCorpus.loadCorpus(inchisFile);
    return inchiCorpus;
  }

  /**
   * Looks up each inchi in the DB and returns the corresponding chemical entries.
   *
   * @param inchis A list of inchis.
   * @param mongoDB The DB from which to get the chemical entries.
   * @return The list of chemicals corresponding to the input inchis, in the same order.
   */
  private static List<Chemical> convertListOfInchisToMolecules(List<String> inchis, MongoDB mongoDB) {
    List<Chemical> result = new ArrayList<>();
    for (String inchi : inchis) {
      result.add(mongoDB.getChemicalFromInChI(inchi));
    }
    return result;
  }
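
  // Hypothetical wiring of the workflow wrapper defined below. The RO ids and file paths are illustrative
  // placeholders only, not values prescribed by this class:
  //
  //   JavaRunnable expansionStep = L2ExpansionDriver.getRunnableOneSubstrateRoExpander(
  //       Arrays.asList(342, 343),               // RO ids to keep from the validation corpus
  //       new File("/tmp/substrate_inchis.txt"), // one substrate inchi per line
  //       new File("/tmp/predictions.json"));    // output prediction corpus
  //   expansionStep.run();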
  /**
   * Wraps L2 expansion so that it can be used in a workflow. The inputs are a list of RO IDs to expand on,
   * a file containing the substrates to apply the ROs to, and a file to which to write the output prediction corpus.
   *
   * @param roIds The RO IDs to expand on; the validation corpus is filtered down to these ids.
   * @param substrateListFile A file with one substrate inchi per line.
   * @param outputFile The file to which to write the json file of predicted reactions.
   * @return A JavaRunnable that runs the expansion when executed.
   */
  public static JavaRunnable getRunnableOneSubstrateRoExpander(List<Integer> roIds,
                                                               File substrateListFile,
                                                               File outputFile) {
    return new JavaRunnable() {

      @Override
      public void run() throws IOException {
        // Verify files.
        FileChecker.verifyInputFile(substrateListFile);
        FileChecker.verifyAndCreateOutputFile(outputFile);

        // Handle input ROs.
        ErosCorpus roCorpus = new ErosCorpus();
        roCorpus.loadValidationCorpus();
        roCorpus.filterCorpusById(roIds);

        // Handle input substrates.
        L2InchiCorpus inchis = new L2InchiCorpus();
        inchis.loadCorpus(substrateListFile);
        List<Molecule> moleculeList = inchis.getMolecules();

        // Build expander.
        PredictionGenerator generator = new AllPredictionsGenerator(new ReactionProjector());
        L2Expander expander = new SingleSubstrateRoExpander(roCorpus, moleculeList, generator);

        // Run expander.
        L2PredictionCorpus predictions = expander.getPredictions();

        // Write output.
        predictions.writePredictionsToJsonFile(outputFile);
      }

      @Override
      public String toString() {
        return "oneSubstrateRoExpander:" + roIds.toString();
      }
    };
  }
}