ProductExtractor.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation;

import act.server.DBIterator;
import act.server.MongoDB;
import act.shared.Chemical;
import act.shared.Organism;
import act.shared.Reaction;
import chemaxon.license.LicenseProcessingException;
import chemaxon.reaction.ReactionException;
import com.act.biointerpretation.cofactorremoval.CofactorRemover;
import com.act.biointerpretation.desalting.ReactionDesalter;
import com.act.biointerpretation.mechanisminspection.MechanisticValidator;
import com.act.biointerpretation.reactionmerging.ReactionMerger;
import com.act.biointerpretation.sequencemerging.SequenceMerger;
import com.act.lcms.db.io.LoadPlateCompositionIntoDB;
import com.act.utils.CLIUtil;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONObject;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;

/**
 * This class extracts all product chemicals from reactions in an installer DB that contain proteins belonging to a
 * class of user-specified organisms.  Cofactors are included in the products extracted by this class.  The
 * type of organism to extract is defined by an organism name prefix: any reaction that contains a protein that
 * references an organism whose name begins with the specified prefix is considered for extraction.
 *
 * Why would we want to extract just the products of reactions?  Doing so allows us to produce a superset of all
 * L2 molecules that we might see in the metabolome of an organism like humans or yeast.  While we may not be able to
 * explicitly declare that all of the extracted molecules are bio-reachable, their characterization in relation to a
 * host organism gives us some evidence that we might see them in an LCMS scan.
 */
public class ProductExtractor {
  private static final Logger LOGGER = LogManager.getFormatterLogger(ProductExtractor.class);

  private static final String OPTION_ORGANISM_PREFIX = "r";
  private static final String OPTION_OUTPUT_FILE = "o";
  private static final String OPTION_DB_NAME = "n";
  private static final String DEFAULT_DB_HOST = "localhost";
  private static final Integer DEFAULT_DB_PORT = 27017;

  // Keys in a reaction's protein JSON objects that hold organism ids: a single id, or an array of ids.
  private static final String PROTEIN_FIELD_ORGANISM = "organism";
  private static final String PROTEIN_FIELD_ORGANISMS = "organisms";

  // Only InChIs that start with INCHI_PREFIX but not with FAKE_INCHI_PREFIX are written to the output.
  private static final String INCHI_PREFIX = "InChI=";
  private static final String FAKE_INCHI_PREFIX = "InChI=/FAKE";

  /** Command line option definitions for this tool; organism prefix and DB name are required. */
  public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<>();

  static {
    OPTION_BUILDERS.add(Option.builder(OPTION_ORGANISM_PREFIX)
        .argName("organism prefix")
        .desc("Organism prefix to use when filtering reactions")
        .hasArg()
        .required()
        .longOpt("organism")
    );
    OPTION_BUILDERS.add(Option.builder(OPTION_OUTPUT_FILE)
        .argName("output file")
        .desc("The file to which to write product InChIs (default is stdout)")
        .hasArg()
        .longOpt("output")
    );
    OPTION_BUILDERS.add(Option.builder(OPTION_DB_NAME)
        .argName("DB name")
        .desc("The name of the DB from which to extract products")
        .hasArg().required()
        .longOpt("db-name")
    );
  }

  private static final String HELP_MESSAGE = StringUtils.join(new String[] {
      "Extracts all products for reactions belonging ",
      "to organisms whose names match a given prefix",
  }, "");

  /**
   * Entry point: parses the command line, finds all organisms whose names start with the requested prefix,
   * collects the product and product-cofactor ids of every reaction that references one of those organisms, and
   * writes the InChI of each collected chemical (one per line) to the output file, or to stdout if none was given.
   *
   * @param args command line arguments, as described by {@link #OPTION_BUILDERS}
   * @throws Exception if the output file cannot be opened, or if any DB/JSON access fails
   */
  public static void main(String[] args) throws Exception {
    CLIUtil cliUtil = new CLIUtil(ProductExtractor.class, HELP_MESSAGE, OPTION_BUILDERS);
    CommandLine cl = cliUtil.parseCommandLine(args);

    String orgPrefix = cl.getOptionValue(OPTION_ORGANISM_PREFIX);
    LOGGER.info("Using organism prefix %s", orgPrefix);

    MongoDB db = new MongoDB(DEFAULT_DB_HOST, DEFAULT_DB_PORT, cl.getOptionValue(OPTION_DB_NAME));

    Map<Long, String> validOrganisms = findOrganismsByPrefix(db, orgPrefix);
    LOGGER.info("Found %d valid organisms", validOrganisms.size());

    Set<Long> productIds = collectProductIds(db, validOrganisms);
    LOGGER.info("Found %d valid product ids for '%s'", productIds.size(), orgPrefix);

    boolean writeToFile = cl.hasOption(OPTION_OUTPUT_FILE);
    PrintWriter writer = writeToFile ?
        new PrintWriter(new FileWriter(cl.getOptionValue(OPTION_OUTPUT_FILE))) :
        new PrintWriter(System.out);
    try {
      writeInchis(db, productIds, writer);
    } finally {
      if (writeToFile) {
        writer.close();
      } else {
        // This PrintWriter buffers and does not auto-flush, so without an explicit flush the output may never
        // reach stdout.  Flush rather than close so that System.out stays usable.
        writer.flush();
      }
    }

    // PrintWriter never throws on write failures; it only records them, so surface them here.
    if (writer.checkError()) {
      LOGGER.error("An I/O error occurred while writing product InChIs; the output may be incomplete");
    }
    LOGGER.info("Done.");
  }

  /**
   * Scans every organism in the DB and keeps those with a non-empty name that starts with the given prefix.
   *
   * @param db        the DB to read organisms from
   * @param orgPrefix the organism name prefix to match
   * @return a map from organism id to organism name for all matching organisms, ordered by id
   */
  private static Map<Long, String> findOrganismsByPrefix(MongoDB db, String orgPrefix) {
    Map<Long, String> matches = new TreeMap<>();
    DBIterator orgIter = db.getDbIteratorOverOrgs();
    Organism organism;
    while ((organism = db.getNextOrganism(orgIter)) != null) {
      String name = organism.getName();
      // Defensive null check: an organism without a name cannot match any prefix.
      if (name != null && !name.isEmpty() && name.startsWith(orgPrefix)) {
        matches.put(organism.getUUID(), name);
      }
    }
    return matches;
  }

  /**
   * Collects the ids of all products and product cofactors of reactions that reference a valid organism.
   *
   * @param db             the DB to read reactions from
   * @param validOrganisms the organisms (keyed by id) whose reactions should be considered
   * @return the collected chemical ids, in ascending order
   */
  private static Set<Long> collectProductIds(MongoDB db, Map<Long, String> validOrganisms) {
    Set<Long> productIds = new TreeSet<>(); // Use something with implicit ordering we can traverse in order.
    DBIterator reactionIterator = db.getIteratorOverReactions();
    Reaction reaction;
    while ((reaction = db.getNextReaction(reactionIterator)) != null) {
      if (!referencesValidOrganism(reaction.getProteinData(), validOrganisms)) {
        continue;
      }
      for (Long id : reaction.getProducts()) {
        productIds.add(id);
      }
      for (Long id : reaction.getProductCofactors()) {
        productIds.add(id);
      }
    }
    return productIds;
  }

  /**
   * Tests whether any of the given protein objects references a valid organism, either through its single
   * "organism" field or through any entry of its "organisms" array.  Returns as soon as a match is found.
   *
   * @param proteins       the protein JSON objects attached to a reaction
   * @param validOrganisms the organisms (keyed by id) to match against
   * @return true if at least one protein references an organism id contained in {@code validOrganisms}
   */
  private static boolean referencesValidOrganism(Set<JSONObject> proteins, Map<Long, String> validOrganisms) {
    for (JSONObject protein : proteins) {
      if (protein.has(PROTEIN_FIELD_ORGANISM)
          && validOrganisms.containsKey(protein.getLong(PROTEIN_FIELD_ORGANISM))) {
        return true;
      }
      if (protein.has(PROTEIN_FIELD_ORGANISMS)) {
        JSONArray organisms = protein.getJSONArray(PROTEIN_FIELD_ORGANISMS);
        for (int i = 0; i < organisms.length(); i++) {
          if (validOrganisms.containsKey(organisms.getLong(i))) {
            return true;
          }
        }
      }
    }
    return false;
  }

  /**
   * Looks up each chemical id and prints its InChI on its own line, skipping InChIs that do not start with
   * "InChI=" or that start with "InChI=/FAKE".  Ids with no chemical or no InChI are skipped with a warning
   * instead of aborting the whole extraction.  The writer is neither flushed nor closed here.
   *
   * @param db         the DB to read chemicals from
   * @param productIds the ids of the chemicals to write
   * @param writer     the destination for the InChIs
   */
  private static void writeInchis(MongoDB db, Set<Long> productIds, PrintWriter writer) {
    for (Long id : productIds) {
      Chemical chemical = db.getChemicalFromChemicalUUID(id);
      String inchi = chemical == null ? null : chemical.getInChI();
      if (inchi == null) {
        LOGGER.warn("No chemical or InChI found for product id %d, skipping", id);
        continue;
      }
      if (inchi.startsWith(INCHI_PREFIX) && !inchi.startsWith(FAKE_INCHI_PREFIX)) {
        writer.println(inchi);
      }
    }
  }
}