/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.act.biointerpretation.l2expansion; import act.server.MongoDB; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.DefaultParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.function.Predicate; public class L2FilteringDriver { private static final Logger LOGGER = LogManager.getFormatterLogger(L2FilteringDriver.class); private static final String OPTION_INPUT_CORPUS = "i"; private static final String OPTION_OUTPUT_PATH = "o"; private static final String OPTION_CHEMICAL_FILTER = "c"; private static final String OPTION_REACTION_FILTER = "r"; private static final String OPTION_DB_LOOKUP = "d"; private static final String OPTION_LOOKUP_TYPES = "L"; private static final String OPTION_SPLIT_BY_RO = "s"; private static final String OPTION_FILTER_SUBSTRATES = "S"; private static final String OPTION_HELP = "h"; private static final String APPLY_FILTER_POSITIVE = "1"; private static final String APPLY_FILTER_NEGATED = "0"; private static final String LOOKUP_REACTIONS = "r"; private static final String LOOKUP_CHEMICALS = "c"; public static final String HELP_MESSAGE = "This class is used to filter an L2PredictionCorpus. An initial corpus is read in from file, processed based on" + "the selected options, and then the result is printed in json format."; public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{ add(Option.builder(OPTION_INPUT_CORPUS) .argName("input corpus path") .desc("The absolute path to the input prediction corpus.") .hasArg() .longOpt("input-corpus-path") .required(true) ); add(Option.builder(OPTION_OUTPUT_PATH) .argName("output path") .desc("The path to which to write the output.") .hasArg() .longOpt("output-path") .required(true) ); add(Option.builder(OPTION_CHEMICAL_FILTER) .argName("chemical db filter") .desc("Use the chemical filter. Input the value " + APPLY_FILTER_POSITIVE + " to keep predictions whose " + "chemicals were all found in the DB, or " + APPLY_FILTER_NEGATED + " to keep those whose chemicals " + "were not all found. This step must either be run on a corpus that already has chemical DB info, or " + "supplied in conjunction with the db-lookup option to populate the chemical info fields before filtering.") .hasArg() .longOpt("chemical-db-filter") ); add(Option.builder(OPTION_REACTION_FILTER) .argName("reaction db filter") .desc("Use the reaction filter. Input the value " + APPLY_FILTER_POSITIVE + " to keep predictions which " + "match a reaction in the DB, or " + APPLY_FILTER_NEGATED + " to keep those which don't. This step must " + "either be run on a corpus that already has reaction DB info, supplied in conjunction with the db-lookup " + "option to populate the reaction info fields before filtering.") .hasArg() .longOpt("reaction-db-filter") ); add(Option.builder(OPTION_DB_LOOKUP) .argName("db name") .desc("Mongo DB to use for lookups; needed only if population of chemical and reaction DB info is desired..") .hasArg() .longOpt("db-name")); add(Option.builder(OPTION_LOOKUP_TYPES) .argName("db lookup types") .desc("This argument specifies which lookup types to use. Use " + LOOKUP_CHEMICALS + " for chemical lookups, " + LOOKUP_REACTIONS + " for reaction lookups, or both. These lookups compare the predictions against our DB " + "and populate the chemical and reaction fields of the L2Predictions accordingly.") .hasArgs() .valueSeparator(',') .longOpt("db-lookup-types")); add(Option.builder(OPTION_SPLIT_BY_RO) .argName("split by ro") .desc("If this argument is selected, the input corpus is read in, split up by ro, and written out into a " + "different output file for each ro found in the corpus. The files have the ro id appended to the end of " + "their names to distinguish them.") .longOpt("split-by-ro")); add(Option.builder(OPTION_FILTER_SUBSTRATES) .argName("filter substrates path") .desc("If this argument is selected, a list of substrates to keep is fed in, and the corpus is filtered " + "to preserve only predictions with substrates among that list.") .hasArg() .longOpt("filter-substrates")); add(Option.builder(OPTION_HELP) .argName("help") .desc("Prints this help message.") .longOpt("help") ); }}; public static final HelpFormatter HELP_FORMATTER = new HelpFormatter(); static { HELP_FORMATTER.setWidth(100); } private static final Predicate<L2Prediction> ALL_CHEMICALS_IN_DB = prediction -> prediction.getProductIds().size() == prediction.getProducts().size() && prediction.getSubstrateIds().size() == prediction.getSubstrates().size(); private static final Predicate<L2Prediction> REACTION_MATCHES_DB = prediction -> prediction.getReactionCount() > 0; public static void main(String[] args) throws Exception { // Build command line parser. Options opts = new Options(); for (Option.Builder b : OPTION_BUILDERS) { opts.addOption(b.build()); } CommandLine cl = null; try { CommandLineParser parser = new DefaultParser(); cl = parser.parse(opts, args); } catch (ParseException e) { LOGGER.error("Argument parsing failed: %s", e.getMessage()); HELP_FORMATTER.printHelp(L2FilteringDriver.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } // Print help. if (cl.hasOption(OPTION_HELP)) { HELP_FORMATTER.printHelp(L2FilteringDriver.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); return; } checkFilterOptionIsValid(OPTION_CHEMICAL_FILTER, cl); checkFilterOptionIsValid(OPTION_REACTION_FILTER, cl); // Get corpus files. File corpusFile = new File(cl.getOptionValue(OPTION_INPUT_CORPUS)); if (!corpusFile.exists()) { LOGGER.error("Input corpus file does not exist."); return; } File outputFile = new File(cl.getOptionValue(OPTION_OUTPUT_PATH)); outputFile.createNewFile(); if (outputFile.isDirectory()) { LOGGER.error("Output file is directory."); System.exit(1); } LOGGER.info("Reading corpus from file."); L2PredictionCorpus predictionCorpus = L2PredictionCorpus.readPredictionsFromJsonFile(corpusFile); LOGGER.info("Read in corpus with %d predictions.", predictionCorpus.getCorpus().size()); LOGGER.info("Corpus has %d distinct substrates.", predictionCorpus.getUniqueSubstrateInchis().size()); if (cl.hasOption(OPTION_FILTER_SUBSTRATES)) { LOGGER.info("Filtering by substrates."); File substratesFile = new File(cl.getOptionValue(OPTION_FILTER_SUBSTRATES)); L2InchiCorpus inchis = new L2InchiCorpus(); inchis.loadCorpus(substratesFile); Set<String> inchiSet = new HashSet<String>(); inchiSet.addAll(inchis.getInchiList()); predictionCorpus = predictionCorpus.applyFilter( prediction -> inchiSet.containsAll(prediction.getSubstrateInchis())); predictionCorpus.writePredictionsToJsonFile(outputFile); LOGGER.info("Done writing filtered corpus to file."); return; } if (cl.hasOption(OPTION_SPLIT_BY_RO)) { LOGGER.info("Splitting corpus into distinct corpuses for each ro."); Map<String, L2PredictionCorpus> corpusMap = predictionCorpus.splitCorpus(prediction -> prediction.getProjectorName()); for (Map.Entry<String, L2PredictionCorpus> entry : corpusMap.entrySet()) { String fileName = cl.getOptionValue(OPTION_OUTPUT_PATH) + "." + entry.getKey(); File oneOutputFile = new File(fileName); entry.getValue().writePredictionsToJsonFile(oneOutputFile); } LOGGER.info("Done writing split corpuses to file."); return; } predictionCorpus = runDbLookups(cl, predictionCorpus, opts); LOGGER.info("Applying filters."); predictionCorpus = applyFilter(predictionCorpus, ALL_CHEMICALS_IN_DB, cl, OPTION_CHEMICAL_FILTER); predictionCorpus = applyFilter(predictionCorpus, REACTION_MATCHES_DB, cl, OPTION_REACTION_FILTER); LOGGER.info("Filtered corpus has %d predictions.", predictionCorpus.getCorpus().size()); LOGGER.info("Printing final corpus."); predictionCorpus.writePredictionsToJsonFile(outputFile); LOGGER.info("L2FilteringDriver complete!."); } private static L2PredictionCorpus runDbLookups(CommandLine cl, L2PredictionCorpus predictionCorpus, Options opts) throws IOException { if (cl.hasOption(OPTION_DB_LOOKUP)) { if (cl.hasOption(OPTION_LOOKUP_TYPES)) { LOGGER.info("Instantiating mongoDB."); MongoDB mongoDB = new MongoDB("localhost", 27017, cl.getOptionValue(OPTION_DB_LOOKUP)); String[] lookupOptions = cl.getOptionValues(OPTION_LOOKUP_TYPES); Set<String> lookupSet = new HashSet<>(); for (String option : lookupOptions) { if (!option.equals(LOOKUP_CHEMICALS) && !option.equals(LOOKUP_REACTIONS)) { LOGGER.error("Invalid lookup option supplied: %s", option); HELP_FORMATTER.printHelp(L2FilteringDriver.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } lookupSet.add(option); } if (lookupSet.contains(LOOKUP_CHEMICALS)) { LOGGER.info("Looking up chemicals in DB."); predictionCorpus = predictionCorpus.applyTransformation(new ChemicalsTransformer(mongoDB)); } if (lookupSet.contains(LOOKUP_REACTIONS)) { LOGGER.info("Looking up reactions in DB."); predictionCorpus = predictionCorpus.applyTransformation(new ReactionsTransformer(mongoDB)); } } else { LOGGER.warn("Mongo DB instantiated but lookup option not selected."); } } return predictionCorpus; } private static void checkFilterOptionIsValid(String filterOption, CommandLine cl) { if (cl.hasOption(filterOption)) { if (cl.getOptionValue(filterOption).equals(APPLY_FILTER_POSITIVE) || cl.getOptionValue(filterOption).equals(APPLY_FILTER_NEGATED)) { return; } else { LOGGER.error("Option %s value not valid. Must receive value %s or %s", filterOption, APPLY_FILTER_POSITIVE, APPLY_FILTER_NEGATED); throw new IllegalArgumentException("Command line value invalid."); } } } private static L2PredictionCorpus applyFilter(L2PredictionCorpus corpus, Predicate<L2Prediction> filter, CommandLine cl, String filterOption) throws IOException { if (cl.hasOption(filterOption)) { if (cl.getOptionValue(filterOption).equals(APPLY_FILTER_NEGATED)) { return corpus.applyFilter(filter.negate()); } return corpus.applyFilter(filter); } return corpus; } }