/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.act.biointerpretation; import act.server.DBIterator; import act.server.MongoDB; import act.shared.Chemical; import act.shared.Organism; import act.shared.Reaction; import chemaxon.license.LicenseProcessingException; import chemaxon.reaction.ReactionException; import com.act.biointerpretation.cofactorremoval.CofactorRemover; import com.act.biointerpretation.desalting.ReactionDesalter; import com.act.biointerpretation.mechanisminspection.MechanisticValidator; import com.act.biointerpretation.reactionmerging.ReactionMerger; import com.act.biointerpretation.sequencemerging.SequenceMerger; import com.act.lcms.db.io.LoadPlateCompositionIntoDB; import com.act.utils.CLIUtil; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.DefaultParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.Validate; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.json.JSONArray; import org.json.JSONObject; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Pattern; /** * This class extracts all product chemicals from reactions in an installer DB that contain proteins belonging to a * class of user-specified organisms. Cofactors are included in the products extracted by this class. The * type of organism to extract is defined by an organism name prefix: any reaction that contains a protein that * references an organism whose name begins with the specified prefix is considered for extraction. * * Why would we want to extract just the products of reactions? Doing so allows us to produce a superset of all * L2 molecules that we might see in the metabolome of an organism like humans or yeast. While we may not be able to * explicitly declare that all of the extracted molecules are bio-reachable, their characterization in relation to a * host organism gives us some evidence that we might see them in an LCMS scan. */ public class ProductExtractor { private static final Logger LOGGER = LogManager.getFormatterLogger(ProductExtractor.class); private static final String OPTION_ORGANISM_PREFIX = "r"; private static final String OPTION_OUTPUT_FILE = "o"; private static final String OPTION_DB_NAME = "n"; private static final String DEFAULT_DB_HOST = "localhost"; private static final Integer DEFAULT_DB_PORT = 27017; public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{ add(Option.builder(OPTION_ORGANISM_PREFIX) .argName("organism prefix") .desc("Organism prefix to use when filtering reactions") .hasArg() .required() .longOpt("organism") ); add(Option.builder(OPTION_OUTPUT_FILE) .argName("output file") .desc("The file to which to write product InChIs (default is stdout)") .hasArg() .longOpt("output") ); add(Option.builder(OPTION_DB_NAME) .argName("DB name") .desc("The name of the DB from which to extract products") .hasArg().required() .longOpt("db-name") ); }}; private static final String HELP_MESSAGE = StringUtils.join(new String[] { "Extracts all products for reactions belonging ", "to organisms whose names match a given prefix", }, ""); public static void main(String[] args) throws Exception { CLIUtil cliUtil = new CLIUtil(ProductExtractor.class, HELP_MESSAGE, OPTION_BUILDERS); CommandLine cl = cliUtil.parseCommandLine(args); String orgPrefix = cl.getOptionValue(OPTION_ORGANISM_PREFIX); LOGGER.info("Using organism prefix %s", orgPrefix); MongoDB db = new MongoDB(DEFAULT_DB_HOST, DEFAULT_DB_PORT, cl.getOptionValue(OPTION_DB_NAME)); Map<Long, String> validOrganisms = new TreeMap<>(); DBIterator orgIter = db.getDbIteratorOverOrgs(); Organism o = null; while ((o = db.getNextOrganism(orgIter)) != null) { if (!o.getName().isEmpty() && o.getName().startsWith(orgPrefix)) { validOrganisms.put(o.getUUID(), o.getName()); } } LOGGER.info("Found %d valid organisms", validOrganisms.size()); Set<Long> productIds = new TreeSet<>(); // Use something with implicit ordering we can traverse in order. DBIterator reactionIterator = db.getIteratorOverReactions(); Reaction r; while ((r = db.getNextReaction(reactionIterator)) != null) { Set<JSONObject> proteins = r.getProteinData(); boolean valid = false; for (JSONObject j : proteins) { if (j.has("organism") && validOrganisms.containsKey(j.getLong("organism"))) { valid = true; break; } else if (j.has("organisms")) { JSONArray organisms = j.getJSONArray("organisms"); for (int i = 0; i < organisms.length(); i++) { if (validOrganisms.containsKey(organisms.getLong(i))) { valid = true; break; } } } } if (valid) { for (Long id : r.getProducts()) { productIds.add(id); } for (Long id : r.getProductCofactors()) { productIds.add(id); } } } LOGGER.info("Found %d valid product ids for '%s'", productIds.size(), orgPrefix); PrintWriter writer = cl.hasOption(OPTION_OUTPUT_FILE) ? new PrintWriter(new FileWriter(cl.getOptionValue(OPTION_OUTPUT_FILE))) : new PrintWriter(System.out); for (Long id : productIds) { Chemical c = db.getChemicalFromChemicalUUID(id); String inchi = c.getInChI(); if (inchi.startsWith("InChI=") && !inchi.startsWith("InChI=/FAKE")) { writer.println(inchi); } } if (cl.hasOption(OPTION_OUTPUT_FILE)) { writer.close(); } LOGGER.info("Done."); } }