/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation.l2expansion;
import act.server.MongoDB;
import act.shared.Chemical;
import chemaxon.struc.Molecule;
import com.act.biointerpretation.Utils.ReactionProjector;
import com.act.biointerpretation.mechanisminspection.ErosCorpus;
import com.act.biointerpretation.sars.SarCorpus;
import com.act.jobs.FileChecker;
import com.act.jobs.JavaRunnable;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
/**
 * Runs L2 expansion: applies every selected RO (optionally constrained by SARs) to every
 * metabolite in an input list, producing a corpus of predicted reactions that is written out
 * as a json file, along with a companion file of the unique product inchis.
 */
public class L2ExpansionDriver {

  private static final Logger LOGGER = LogManager.getFormatterLogger(L2ExpansionDriver.class);

  // Sentinel meaning "no mass filtering": no substrate can exceed Integer.MAX_VALUE daltons.
  private static final Integer NO_MASS_THRESHOLD = Integer.MAX_VALUE;

  private static final String OPTION_METABOLITES = "m";
  private static final String OPTION_MASS_THRESHOLD = "M";
  private static final String OPTION_RO_CORPUS = "c";
  private static final String OPTION_RO_IDS = "r";
  private static final String OPTION_SAR_CORPUS = "s";
  private static final String OPTION_OUTPUT_PATH = "o";
  private static final String OPTION_PROGRESS_PATH = "p";
  private static final String OPTION_DB = "db";
  private static final String OPTION_EXPANSION_TYPE = "t";
  // Fix: this short opt was previously "p", which collided with OPTION_PROGRESS_PATH and made the
  // two options indistinguishable to the CLI parser. The long option name is unchanged, so
  // invocations using --additional-chemicals-file are unaffected.
  private static final String OPTION_ADDITIONAL_CHEMICALS = "a";
  private static final String OPTION_HELP = "h";

  public static final String HELP_MESSAGE =
      "This class is used to carry out L2 expansion. It first applies every RO from the input RO list to " +
          "every metabolite in the input metabolite list. Example input lists can be found on the NAS at " +
          "MNT_SHARED_DATA/Gil/resources. This creates a list of predicted reactions, which are augmented " +
          "with chemical ids and names, as well as reaction ids from the database. At the end of the run, " +
          "the predictions are printed to a json file.";

  public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{
    add(Option.builder(OPTION_METABOLITES)
        .argName("metabolites path name")
        .desc("The absolute path to the metabolites file.")
        .hasArg()
        .longOpt("metabolite-file")
        .required(true)
    );
    add(Option.builder(OPTION_MASS_THRESHOLD)
        .argName("mass threshold")
        .desc("The maximum mass of a substrate, in daltons. Substrates with higher mass will be discarded.")
        .hasArg()
        .longOpt("mass-threshold")
        .type(Integer.class)
    );
    add(Option.builder(OPTION_RO_CORPUS)
        .argName("ro corpus")
        .desc("The path to the file containing the eros corpus, if not the validation corpus. Ignored if " +
            "running a SAR expansion.")
        .hasArg()
        .longOpt("ro-corpus")
    );
    // Fix: OPTION_RO_IDS was registered twice (long opts "ro-file" and "ro-ids"). Commons CLI keys
    // options by their short opt, so the second registration shadowed the first and silently broke
    // --ro-file. Only the effective ("ro-ids") registration is kept.
    add(Option.builder(OPTION_RO_IDS)
        .argName("ro ids")
        .desc("The absolute path to the file containing the RO ids to use. If this option is omitted, " +
            "all ROs in the corpus are used.")
        .hasArg()
        .longOpt("ro-ids")
    );
    add(Option.builder(OPTION_SAR_CORPUS)
        .argName("sar corpus")
        .desc("The path to a file containing the sar corpus to use. Ignored if running an RO-only expansion.")
        .hasArg()
        .longOpt("sar-corpus")
    );
    add(Option.builder(OPTION_OUTPUT_PATH)
        .argName("output file path")
        .desc("The path to the file to which to write the json file of predicted reactions.")
        .hasArg()
        .longOpt("output-file-path")
        .required(true)
    );
    add(Option.builder(OPTION_PROGRESS_PATH)
        .argName("progress file path")
        .desc("The path to the file to which to write the json file of predicted reactions as each projection runs.")
        .hasArg()
        .longOpt("progress-file-path")
    );
    add(Option.builder(OPTION_DB)
        .argName("db name")
        .desc("The name of the mongo DB to use.")
        .hasArg()
        .longOpt("db-name")
    );
    add(Option.builder(OPTION_EXPANSION_TYPE)
        .argName("type of expansion")
        .desc("Type can take values: {ONE_SUB, TWO_SUB, SAR}. ONE_SUB and TWO_SUB operate with only ROs, on one " +
            "and two substrates, respectively, using only ROs. SAR runs an expansion from a SarCorpus, which " +
            "still applies ROs but additionally constrains the substrates of each RO based on the supplied SARs.")
        .hasArg()
        .longOpt("expansion-type")
        .required(true)
    );
    add(Option.builder(OPTION_ADDITIONAL_CHEMICALS)
        .argName("additional chemicals path name")
        .desc("The absolute path to the additional chemicals file.")
        .hasArg()
        .longOpt("additional-chemicals-file")
    );
    add(Option.builder(OPTION_HELP)
        .argName("help")
        .desc("Prints this help message.")
        .longOpt("help")
    );
  }};

  public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

  static {
    HELP_FORMATTER.setWidth(100);
  }

  public static final String LOCAL_HOST = "localhost";
  public static final Integer PORT_NUMBER = 27017;

  /**
   * The supported flavors of expansion; selected on the command line via OPTION_EXPANSION_TYPE.
   */
  public enum ExpansionType {
    ONE_SUB,
    TWO_SUB,
    SAR,
  }

  /**
   * Parses the command line, loads the substrate corpus, runs the requested expansion, and writes
   * the prediction corpus (plus the unique product inchis) to the requested output files.
   *
   * @param args Command line arguments; see OPTION_BUILDERS / HELP_MESSAGE.
   * @throws Exception On any I/O or expansion failure.
   */
  public static void main(String[] args) throws Exception {

    // Build command line parser.
    Options opts = new Options();
    for (Option.Builder b : OPTION_BUILDERS) {
      opts.addOption(b.build());
    }

    CommandLine cl = null;
    try {
      CommandLineParser parser = new DefaultParser();
      cl = parser.parse(opts, args);
    } catch (ParseException e) {
      LOGGER.error("Argument parsing failed: %s", e.getMessage());
      HELP_FORMATTER.printHelp(L2ExpansionDriver.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      System.exit(1);
    }

    // Print help.
    if (cl.hasOption(OPTION_HELP)) {
      HELP_FORMATTER.printHelp(L2ExpansionDriver.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      return;
    }

    // Get output files. We refuse to clobber anything that already exists (exists() also covers
    // directories, so a separate isDirectory() check is unnecessary).
    String outputPath = cl.getOptionValue(OPTION_OUTPUT_PATH);
    File outputFile = new File(outputPath);
    if (outputFile.exists()) {
      LOGGER.error("Supplied output file is a directory or already exists.");
      System.exit(1);
    }
    outputFile.createNewFile();

    File inchiOutputFile = new File(outputPath + ".inchis");
    if (inchiOutputFile.exists()) {
      LOGGER.error("Supplied inchi output file is a directory or already exists.");
      System.exit(1);
    }
    inchiOutputFile.createNewFile();

    Optional<OutputStream> maybeProgressStream = Optional.empty();
    if (cl.hasOption(OPTION_PROGRESS_PATH)) {
      String progressPath = cl.getOptionValue(OPTION_PROGRESS_PATH);
      File progressFile = new File(progressPath);
      LOGGER.info("Writing incremental results to file at %s", progressFile.getAbsolutePath());
      if (progressFile.exists()) {
        LOGGER.error("Supplied progress file is a directory or already exists.");
        System.exit(1);
      }
      maybeProgressStream = Optional.of(new FileOutputStream(progressFile));
    }

    // Get metabolite list.
    L2InchiCorpus inchiCorpus = getInchiCorpus(cl, OPTION_METABOLITES);
    LOGGER.info("%d substrate inchis.", inchiCorpus.getInchiList().size());

    Integer maxMass = NO_MASS_THRESHOLD;
    if (cl.hasOption(OPTION_MASS_THRESHOLD)) {
      maxMass = Integer.parseInt(cl.getOptionValue(OPTION_MASS_THRESHOLD));
      LOGGER.info("Filtering out substrates with mass more than %d daltons.", maxMass);
    }
    inchiCorpus.filterByMass(maxMass);
    LOGGER.info("%d substrate inchis that are importable as molecules.", inchiCorpus.getInchiList().size());

    PredictionGenerator generator = new AllPredictionsGenerator(new ReactionProjector());

    L2Expander expander = buildExpander(cl, inchiCorpus, generator);

    try {
      L2PredictionCorpus predictionCorpus = expander.getPredictions(maybeProgressStream);
      LOGGER.info("Done with L2 expansion. Produced %d predictions.", predictionCorpus.getCorpus().size());

      LOGGER.info("Writing corpus to file.");
      predictionCorpus.writePredictionsToJsonFile(outputFile);
      L2InchiCorpus productInchis = new L2InchiCorpus(predictionCorpus.getUniqueProductInchis());
      productInchis.writeToFile(inchiOutputFile);
    } finally {
      // Fix: the progress stream was previously never closed (resource leak).
      if (maybeProgressStream.isPresent()) {
        maybeProgressStream.get().close();
      }
    }

    LOGGER.info("L2ExpansionDriver complete!");
  }

  /**
   * Constructs the appropriate L2Expander for the expansion type given on the command line.
   *
   * @param cl The parsed command line.
   * @param inchiCorpus The substrate corpus to expand on.
   * @param generator The prediction generator to use.
   * @return The configured expander.
   * @throws IOException If a corpus file cannot be read.
   */
  private static L2Expander buildExpander(CommandLine cl,
                                          L2InchiCorpus inchiCorpus,
                                          PredictionGenerator generator) throws IOException {

    ExpansionType expansionType = ExpansionType.valueOf(cl.getOptionValue(OPTION_EXPANSION_TYPE));

    switch (expansionType) {
      case ONE_SUB:
        LOGGER.info("Running one substrate expansion");
        return new SingleSubstrateRoExpander(getRoCorpus(cl), inchiCorpus.getMolecules(), generator);

      case TWO_SUB:
        LOGGER.info("Running two substrate expansion.");
        LOGGER.warn("This functionality is still experimental as it is not currently tested.");
        if (!cl.hasOption(OPTION_ADDITIONAL_CHEMICALS)) {
          LOGGER.error("Must supply additional chemicals file for two substrate expansion.");
          System.exit(1);
        }
        // Fix: previously the DB name was read without checking it was supplied.
        if (!cl.hasOption(OPTION_DB)) {
          LOGGER.error("Must supply a db name for two substrate expansion.");
          System.exit(1);
        }
        MongoDB mongoDB = new MongoDB(LOCAL_HOST, PORT_NUMBER, cl.getOptionValue(OPTION_DB)); // Start mongo instance.
        L2InchiCorpus chemicalInchis = getInchiCorpus(cl, OPTION_ADDITIONAL_CHEMICALS);
        List<Chemical> chemicalsOfInterest =
            L2ExpansionDriver.convertListOfInchisToMolecules(chemicalInchis.getInchiList(), mongoDB);
        List<Chemical> metaboliteChemicals =
            L2ExpansionDriver.convertListOfInchisToMolecules(inchiCorpus.getInchiList(), mongoDB);
        return new TwoSubstrateRoExpander(chemicalsOfInterest, metaboliteChemicals, getRoCorpus(cl), generator);

      case SAR:
        LOGGER.info("Running sar-based expansion.");
        // Fix: previously a missing -s option caused an NPE in `new File(null)` instead of a
        // clear error message.
        if (!cl.hasOption(OPTION_SAR_CORPUS)) {
          LOGGER.error("Must supply a sar corpus file for SAR expansion.");
          System.exit(1);
        }
        File sarCorpusFile = new File(cl.getOptionValue(OPTION_SAR_CORPUS));
        if (!sarCorpusFile.exists() || sarCorpusFile.isDirectory()) {
          LOGGER.error("Sar corpus is not a valid file.");
          System.exit(1);
        }
        SarCorpus sarCorpus = SarCorpus.readCorpusFromJsonFile(sarCorpusFile);
        return new SingleSubstrateSarExpander(sarCorpus, inchiCorpus.getMolecules(), generator);

      default:
        throw new IllegalArgumentException("Invalid expansion type.");
    }
  }

  /**
   * Loads the RO corpus (from the supplied file, or the built-in validation corpus), optionally
   * filtered down to the RO ids listed in the supplied ids file.
   *
   * @param cl The parsed command line.
   * @return The loaded, possibly filtered, RO corpus.
   * @throws IOException If a corpus file cannot be read.
   */
  private static ErosCorpus getRoCorpus(CommandLine cl) throws IOException {
    ErosCorpus eroCorpus = new ErosCorpus();
    if (cl.hasOption(OPTION_RO_CORPUS)) {
      File roCorpusFile = new File(cl.getOptionValue(OPTION_RO_CORPUS));
      if (!roCorpusFile.exists()) {
        LOGGER.error("Ro corpus file does not exist.");
        System.exit(1);
      }
      FileInputStream roInputStream = new FileInputStream(roCorpusFile);
      eroCorpus.loadCorpus(roInputStream);
    } else {
      eroCorpus.loadValidationCorpus();
    }

    if (cl.hasOption(OPTION_RO_IDS)) {
      LOGGER.info("Filtering corpus by RO list from rosFile.");
      File roIdsFile = new File(cl.getOptionValue(OPTION_RO_IDS));
      if (!roIdsFile.exists()) {
        LOGGER.error("Ro ids file does not exist.");
        System.exit(1);
      }
      eroCorpus.filterCorpusByIdFile(roIdsFile);
    } else {
      LOGGER.info("Leaving all ROs in corpus.");
    }
    return eroCorpus;
  }

  /**
   * Gets a list of inchis for a command line option that points to a file with one inchi per line.
   *
   * @param cl Command line parser.
   * @param optionForFileName Option for a file with one inchi per line. Either the metabolite list or addition
   *                          chemical list.
   * @return The list of inchis contained in the file.
   * @throws IOException If the file cannot be read.
   */
  private static L2InchiCorpus getInchiCorpus(CommandLine cl, String optionForFileName) throws IOException {
    File inchisFile = new File(cl.getOptionValue(optionForFileName));
    LOGGER.info("Getting inchi list from %s", inchisFile);
    L2InchiCorpus inchiCorpus = new L2InchiCorpus();
    inchiCorpus.loadCorpus(inchisFile);
    return inchiCorpus;
  }

  /**
   * Looks up each inchi in the DB and returns the corresponding chemical entries, in input order.
   * (Doc fix: this returns a list, not a map, despite the method name.)
   *
   * NOTE(review): entries for inchis not present in the DB may be null — confirm the contract of
   * MongoDB.getChemicalFromInChI before relying on non-null elements.
   *
   * @param inchis A list of inchis.
   * @param mongoDB The db from which to get the chemical entries.
   * @return The chemical entry for each inchi, in the same order as the input list.
   */
  private static List<Chemical> convertListOfInchisToMolecules(List<String> inchis, MongoDB mongoDB) {
    List<Chemical> result = new ArrayList<>(inchis.size());
    for (String inchi : inchis) {
      result.add(mongoDB.getChemicalFromInChI(inchi));
    }
    return result;
  }

  /**
   * Wraps L2 expansion so that it can be used in a workflow. The inputs are a list of RO IDs to expand on,
   * a file containing the substrates to apply the ROs to, and a file to which to write the output prediction corpus.
   *
   * @param roIds The RO ids to keep from the validation corpus.
   * @param substrateListFile File with one substrate inchi per line.
   * @param outputFile Destination for the json prediction corpus.
   * @return A JavaRunnable that performs the expansion when run.
   */
  public static JavaRunnable getRunnableOneSubstrateRoExpander(List<Integer> roIds,
                                                               File substrateListFile,
                                                               File outputFile) {
    return new JavaRunnable() {
      @Override
      public void run() throws IOException {
        // Verify files
        FileChecker.verifyInputFile(substrateListFile);
        FileChecker.verifyAndCreateOutputFile(outputFile);

        // Handle input ros
        ErosCorpus roCorpus = new ErosCorpus();
        roCorpus.loadValidationCorpus();
        roCorpus.filterCorpusById(roIds);

        // Handle input substrates
        L2InchiCorpus inchis = new L2InchiCorpus();
        inchis.loadCorpus(substrateListFile);
        List<Molecule> moleculeList = inchis.getMolecules();

        // Build expander
        PredictionGenerator generator = new AllPredictionsGenerator(new ReactionProjector());
        L2Expander expander = new SingleSubstrateRoExpander(roCorpus, moleculeList, generator);

        // Run expander
        L2PredictionCorpus predictions = expander.getPredictions();

        // Write output
        predictions.writePredictionsToJsonFile(outputFile);
      }

      @Override
      public String toString() {
        return "oneSubstrateRoExpander:" + roIds.toString();
      }
    };
  }
}