/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation;
import act.server.DBIterator;
import act.server.MongoDB;
import act.shared.Chemical;
import act.shared.Organism;
import act.shared.Reaction;
import chemaxon.license.LicenseProcessingException;
import chemaxon.reaction.ReactionException;
import com.act.biointerpretation.cofactorremoval.CofactorRemover;
import com.act.biointerpretation.desalting.ReactionDesalter;
import com.act.biointerpretation.mechanisminspection.MechanisticValidator;
import com.act.biointerpretation.reactionmerging.ReactionMerger;
import com.act.biointerpretation.sequencemerging.SequenceMerger;
import com.act.lcms.db.io.LoadPlateCompositionIntoDB;
import com.act.utils.CLIUtil;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONObject;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;
/**
 * This class extracts all product chemicals from reactions in an installer DB that contain proteins belonging to a
 * class of user-specified organisms. Cofactors are included in the products extracted by this class. The
 * type of organism to extract is defined by an organism name prefix: any reaction that contains a protein that
 * references an organism whose name begins with the specified prefix is considered for extraction.
 *
 * Why would we want to extract just the products of reactions? Doing so allows us to produce a superset of all
 * L2 molecules that we might see in the metabolome of an organism like humans or yeast. While we may not be able to
 * explicitly declare that all of the extracted molecules are bio-reachable, their characterization in relation to a
 * host organism gives us some evidence that we might see them in an LCMS scan.
 *
 * Output is one InChI per line, in ascending order of product chemical id. Only InChIs that start with
 * {@code InChI=} and do not start with {@code InChI=/FAKE} are written.
 */
public class ProductExtractor {
  private static final Logger LOGGER = LogManager.getFormatterLogger(ProductExtractor.class);

  /** Short CLI flag: organism name prefix used to select reactions (required). */
  private static final String OPTION_ORGANISM_PREFIX = "r";
  /** Short CLI flag: path of the file to write InChIs to (optional; stdout when absent). */
  private static final String OPTION_OUTPUT_FILE = "o";
  /** Short CLI flag: name of the DB to read from (required). */
  private static final String OPTION_DB_NAME = "n";

  private static final String DEFAULT_DB_HOST = "localhost";
  private static final Integer DEFAULT_DB_PORT = 27017;

  /** Command line option definitions handed to {@link CLIUtil}. */
  public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{
    add(Option.builder(OPTION_ORGANISM_PREFIX)
        .argName("organism prefix")
        .desc("Organism prefix to use when filtering reactions")
        .hasArg()
        .required()
        .longOpt("organism")
    );
    add(Option.builder(OPTION_OUTPUT_FILE)
        .argName("output file")
        .desc("The file to which to write product InChIs (default is stdout)")
        .hasArg()
        .longOpt("output")
    );
    add(Option.builder(OPTION_DB_NAME)
        .argName("DB name")
        .desc("The name of the DB from which to extract products")
        .hasArg().required()
        .longOpt("db-name")
    );
  }};

  private static final String HELP_MESSAGE = StringUtils.join(new String[] {
      "Extracts all products for reactions belonging ",
      "to organisms whose names match a given prefix",
  }, "");

  /**
   * Entry point. Parses the command line, collects the ids of all organisms whose name starts with the requested
   * prefix, gathers the product and product-cofactor ids of every reaction that has a protein referencing one of
   * those organisms, and writes the InChIs of those products to the output file or to stdout.
   *
   * @param args command line arguments; see {@link #OPTION_BUILDERS}
   * @throws Exception if anything fails while parsing arguments, reading the DB, or writing the output
   */
  public static void main(String[] args) throws Exception {
    CLIUtil cliUtil = new CLIUtil(ProductExtractor.class, HELP_MESSAGE, OPTION_BUILDERS);
    CommandLine cl = cliUtil.parseCommandLine(args);

    String orgPrefix = cl.getOptionValue(OPTION_ORGANISM_PREFIX);
    LOGGER.info("Using organism prefix %s", orgPrefix);

    MongoDB db = new MongoDB(DEFAULT_DB_HOST, DEFAULT_DB_PORT, cl.getOptionValue(OPTION_DB_NAME));

    // Pass 1: organism id -> name for every organism whose name begins with the prefix.
    Map<Long, String> validOrganisms = new TreeMap<>();
    DBIterator orgIter = db.getDbIteratorOverOrgs();
    Organism o;
    while ((o = db.getNextOrganism(orgIter)) != null) {
      String name = o.getName();
      // Organisms without a usable name can never match a prefix; skip them rather than fail on a null name.
      if (name != null && !name.isEmpty() && name.startsWith(orgPrefix)) {
        validOrganisms.put(o.getUUID(), name);
      }
    }
    LOGGER.info("Found %d valid organisms", validOrganisms.size());

    // Pass 2: product ids of every reaction that has at least one protein tied to a valid organism.
    Set<Long> productIds = new TreeSet<>(); // Use something with implicit ordering we can traverse in order.
    DBIterator reactionIterator = db.getIteratorOverReactions();
    Reaction r;
    while ((r = db.getNextReaction(reactionIterator)) != null) {
      boolean valid = false;
      for (JSONObject protein : r.getProteinData()) {
        // A protein may carry a single "organism" id and/or an "organisms" array of ids.
        if (protein.has("organism") && validOrganisms.containsKey(protein.getLong("organism"))) {
          valid = true;
        } else if (protein.has("organisms")) {
          JSONArray organisms = protein.getJSONArray("organisms");
          for (int i = 0; i < organisms.length() && !valid; i++) {
            valid = validOrganisms.containsKey(organisms.getLong(i));
          }
        }
        if (valid) {
          // One matching protein is enough; no need to look at the remaining proteins.
          break;
        }
      }

      if (valid) {
        for (Long id : r.getProducts()) {
          productIds.add(id);
        }
        for (Long id : r.getProductCofactors()) {
          productIds.add(id);
        }
      }
    }
    LOGGER.info("Found %d valid product ids for '%s'", productIds.size(), orgPrefix);

    // Pass 3: emit InChIs.
    boolean writeToFile = cl.hasOption(OPTION_OUTPUT_FILE);
    PrintWriter writer = writeToFile ?
        new PrintWriter(new FileWriter(cl.getOptionValue(OPTION_OUTPUT_FILE))) :
        new PrintWriter(System.out);
    try {
      for (Long id : productIds) {
        Chemical c = db.getChemicalFromChemicalUUID(id);
        if (c == null) {
          LOGGER.warn("No chemical found for product id %d, skipping", id);
          continue;
        }
        String inchi = c.getInChI();
        if (inchi == null) {
          LOGGER.warn("Chemical %d has no InChI, skipping", id);
          continue;
        }
        if (inchi.startsWith("InChI=") && !inchi.startsWith("InChI=/FAKE")) {
          writer.println(inchi);
        }
      }
    } finally {
      if (writeToFile) {
        writer.close();
      } else {
        // PrintWriter buffers and does not auto-flush: without this the stdout output can be lost at exit.
        // Flush only; System.out must stay open for the rest of the process.
        writer.flush();
      }
    }

    LOGGER.info("Done.");
  }
}