/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package org.twentyn.proteintodna; import act.server.MongoDB; import act.shared.Chemical; import act.shared.Reaction; import act.shared.Seq; import com.act.biointerpretation.metadata.ProteinMetadata; import com.act.biointerpretation.metadata.RankPathway; import com.act.reachables.Cascade; import com.act.reachables.ReactionPath; import com.act.utils.CLIUtil; import com.mongodb.BasicDBObject; import com.mongodb.DB; import com.mongodb.DBObject; import com.mongodb.MongoClient; import com.mongodb.MongoInternalException; import com.mongodb.ServerAddress; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.Option; import org.apache.commons.lang3.tuple.Pair; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.json.JSONArray; import org.json.JSONObject; import org.mongojack.DBCursor; import org.mongojack.DBQuery; import org.mongojack.JacksonDBCollection; import org.mongojack.WriteResult; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; import java.util.stream.Collectors; public class ProteinToDNADriver { private static final Logger LOGGER = LogManager.getFormatterLogger(ProteinToDNADriver.class); private static final String DEFAULT_DB_HOST = "localhost"; private static final String DEFAULT_DB_PORT = "27017"; private static final String DEFAULT_OUTPUT_DB_NAME = "wiki_reachables"; private static final String DEFAULT_INPUT_DB_NAME = "SHOULD_COME_FROM_CMDLINE"; // "jarvis_2016-12-09"; public static final String DEFAULT_INPUT_PATHWAY_COLLECTION_NAME = "SHOULD_COME_FROM_CMDLINE"; // "pathways_jarvis"; public static final String DEFAULT_OUTPUT_PATHWAY_COLLECTION_NAME = "SHOULD_COME_FROM_CMDLINE"; // "pathways_vijay"; public static final String DEFAULT_OUTPUT_DNA_SEQ_COLLECTION_NAME = "SHOULD_COME_FROM_CMDLINE"; // "dna_designs"; private static final String OPTION_DB_HOST = "H"; private static final String OPTION_DB_PORT = "p"; private static final String OPTION_OUTPUT_DB_NAME = "o"; private static final String OPTION_INPUT_DB_NAME = "i"; private static final String OPTION_INPUT_PATHWAY_COLLECTION_NAME = "c"; private static final String OPTION_OUTPUT_PATHWAY_COLLECTION_NAME = "d"; private static final String OPTION_OUTPUT_DNA_SEQ_COLLECTION_NAME = "e"; private static final String OPTION_DESIGN_SOME = "m"; private static final Integer HIGHEST_SCORING_INFERRED_SEQ_INDEX = 0; private static final Set<String> BLACKLISTED_WORDS_IN_INFERRED_SEQ = new HashSet<>(Arrays.asList("Fragment")); private static final Pattern REGEX_ID = Pattern.compile("^\\d+$"); private static final Pattern REGEX_INCHI = Pattern.compile("^InChI=1S?/"); // Based on https://en.wikipedia.org/wiki/International_Chemical_Identifier#InChIKey private static final Pattern REGEX_INCHI_KEY = Pattern.compile("^[A-Z]{14}-[A-Z]{10}-[A-Z]$"); public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{ add(Option.builder(OPTION_DB_HOST) .argName("hostname") .desc(String.format("The DB host to which to connect (default: %s)", DEFAULT_DB_HOST)) .hasArg() .longOpt("db-host") ); add(Option.builder(OPTION_DB_PORT) .argName("port") .desc(String.format("The DB port to which to connect (default: %s)", DEFAULT_DB_PORT)) .hasArg() .longOpt("db-port") ); add(Option.builder(OPTION_OUTPUT_DB_NAME) .argName("output-db-name") .desc(String.format("The name of the database to read pathways and write seqs to (default: %s)", DEFAULT_OUTPUT_DB_NAME)) .hasArg() .longOpt("output-db-name") ); add(Option.builder(OPTION_INPUT_DB_NAME) .argName("input-db-name") .desc(String.format("The name of the database to read reactions from (default: %s)", DEFAULT_INPUT_DB_NAME)) .hasArg() .required() .longOpt("source-db-name") ); add(Option.builder(OPTION_INPUT_PATHWAY_COLLECTION_NAME) .argName("collection-name") .desc(String.format("The name of the input pathway collection to read from (default: %s)", DEFAULT_INPUT_PATHWAY_COLLECTION_NAME)) .hasArg() .required() .longOpt("input-pathway-collection") ); add(Option.builder(OPTION_OUTPUT_PATHWAY_COLLECTION_NAME) .argName("collection-name") .desc(String.format("The name of the output pathway collection to write to (default: %s)", DEFAULT_OUTPUT_PATHWAY_COLLECTION_NAME)) .hasArg() .required() .longOpt("output-pathway-collection") ); add(Option.builder(OPTION_OUTPUT_DNA_SEQ_COLLECTION_NAME) .argName("collection-name") .desc(String.format("The name of the output dna seq collection to write to (default: %s)", DEFAULT_OUTPUT_DNA_SEQ_COLLECTION_NAME)) .hasArg() .required() .longOpt("output-dna-seq-collection") ); add(Option.builder(OPTION_DESIGN_SOME) .argName("molecule") .desc("Generate designs for specified molecules; can be a numeric ids, InChIs, or InChI Keys, *separated by '|'* " + "for InChI compatibility. Molecules must exist in chemical source DB if InChI or InChI Key is used.") .hasArgs().valueSeparator('|') .longOpt("design-this") ); }}; public static final String HELP_MESSAGE = "This class is the driver to extract protein sequences from pathways and construct DNA designs from these proteins."; private static final CLIUtil CLI_UTIL = new CLIUtil(ProteinToDNADriver.class, HELP_MESSAGE, OPTION_BUILDERS); /** * This function gets all protein combinations of a pathway from candidate protein sequences from each reaction on * the pathway. * @param listOfSetOfProteinSequences A list of sets of candidate protein sequences in the pathway * @return A set of all possible combinations of proteins from all the reactions in the pathway. */ public static Set<List<String>> makePermutations(List<Set<String>> listOfSetOfProteinSequences) { Set<List<String>> accum = new HashSet<>(); makePermutationsHelper(accum, listOfSetOfProteinSequences, new ArrayList<>()); return accum; } private static void makePermutationsHelper(Set<List<String>> accum, List<Set<String>> input, List<String> prefix) { // Base case: no more sequences to add. Accumulate and return. if (input.isEmpty()) { accum.add(prefix); return; } // Recursive case: iterate through next level of input sequences, appending each to a prefix and recurring. Set<String> head = input.get(0); // Avoid index out of bounds exception. List<Set<String>> rest = input.size() > 1 ? input.subList(1, input.size()) : Collections.emptyList(); for (String next : head) { List<String> newPrefix = new ArrayList<>(prefix); newPrefix.add(next); makePermutationsHelper(accum, rest, newPrefix); } } public static void main(String[] args) throws Exception { CommandLine cl = CLI_UTIL.parseCommandLine(args); String reactionDbName = cl.getOptionValue(OPTION_INPUT_DB_NAME, DEFAULT_INPUT_DB_NAME); String dbHost = cl.getOptionValue(OPTION_DB_HOST, DEFAULT_DB_HOST); Integer dbPort = Integer.valueOf(cl.getOptionValue(OPTION_DB_PORT, DEFAULT_DB_PORT)); MongoDB reactionDB = new MongoDB(dbHost, dbPort, reactionDbName); MongoClient inputClient = new MongoClient(new ServerAddress(dbHost, dbPort)); String outDB = cl.getOptionValue(OPTION_OUTPUT_DB_NAME, DEFAULT_OUTPUT_DB_NAME); DB db = inputClient.getDB(outDB); String inputPathwaysCollectionName = cl.getOptionValue(OPTION_INPUT_PATHWAY_COLLECTION_NAME, DEFAULT_INPUT_PATHWAY_COLLECTION_NAME); String outputPathwaysCollectionName = cl.getOptionValue(OPTION_OUTPUT_PATHWAY_COLLECTION_NAME, DEFAULT_OUTPUT_PATHWAY_COLLECTION_NAME); String outputDnaDeqCollectionName = cl.getOptionValue(OPTION_OUTPUT_DNA_SEQ_COLLECTION_NAME, DEFAULT_OUTPUT_DNA_SEQ_COLLECTION_NAME); JacksonDBCollection<ReactionPath, String> inputPathwayCollection = JacksonDBCollection.wrap(db.getCollection(inputPathwaysCollectionName), ReactionPath.class, String.class); JacksonDBCollection<DNADesign, String> dnaDesignCollection = JacksonDBCollection.wrap(db.getCollection(outputDnaDeqCollectionName), DNADesign.class, String.class); JacksonDBCollection<ReactionPath, String> outputPathwayCollection = JacksonDBCollection.wrap(db.getCollection(outputPathwaysCollectionName), ReactionPath.class, String.class); Map<String, Set<ProteinInformation>> proteinSeqToOrgInfo = new HashMap<>(); ProteinsToDNA2 p2d = ProteinsToDNA2.initiate(); List<Long> reachableIds = cl.hasOption(OPTION_DESIGN_SOME) ? Arrays.stream(cl.getOptionValues(OPTION_DESIGN_SOME)). map(m -> lookupMolecule(reactionDB, m)).collect(Collectors.toList()) : Collections.emptyList(); // If there is a list of molecules for which to create designs, only consider their pathways. Otherwise, find all. DBObject query = reachableIds.isEmpty() ? new BasicDBObject() : DBQuery.in("target", reachableIds); /* Extract all pathway ids, then read pathways one id at a time. This will reduce the likelihood of the cursor * timing out, which has a tendency to happen when doing expensive operations before advancing (as done here). */ DBCursor<ReactionPath> cursor = inputPathwayCollection.find(query, new BasicDBObject("_id", true)); List<String> pathwayIds = new ArrayList<>(); while (cursor.hasNext()) { pathwayIds.add(cursor.next().getId()); } for (String pathwayId : pathwayIds) { ReactionPath reactionPath = inputPathwayCollection.findOne(DBQuery.is("_id", pathwayId)); List<List<Pair<ProteinMetadata, Integer>>> processedP = RankPathway.processSinglePathAsJava(reactionPath, reactionDbName, outDB); if (processedP == null) { LOGGER.info(String.format("Process pathway was filtered out possibly because there were more than %s seqs for a given pathway", RankPathway.MAX_PROTEINS_PER_PATH())); outputPathwayCollection.insert(reactionPath); continue; } Boolean atleastOneSeqMissingInPathway = false; List<Set<String>> proteinPaths = new ArrayList<>(); for (Cascade.NodeInformation nodeInformation : reactionPath.getPath().stream().filter(nodeInfo -> nodeInfo.getIsReaction()).collect(Collectors.toList())) { Set<String> proteinSeqs = new HashSet<>(); for (Long id : nodeInformation.getReactionIds()) { // If the id is negative, it is a reaction in the reverse direction. Moreover, the enzyme for this reverse // reaction is the same, so can use the actual positive reaction id's protein seq reference. // TODO: Add a preference for the positive forward direction compared to the negative backward direction seq. if (id < 0) { LOGGER.info("Found a negative reaction id", id); id = Reaction.reverseID(id); } Reaction reaction = reactionDB.getReactionFromUUID(id); for (JSONObject data : reaction.getProteinData()) { // Get the sequences if (data.has("sequences")) { JSONArray seqs = data.getJSONArray("sequences"); for (int i = 0; i < seqs.length(); i++) { Long s = seqs.getLong(i); if (s != null) { Seq sequenceInfo = reactionDB.getSeqFromID(s); String dnaSeq = sequenceInfo.getSequence(); if (dnaSeq == null) { LOGGER.info(String.format("Sequence string for seq id %d, reaction id %d and reaction path %s is null", s, id, reactionPath.getId())); continue; } // odd sequence if (dnaSeq.length() <= 80 || dnaSeq.charAt(0) != 'M') { JSONObject metadata = sequenceInfo.getMetadata(); if (!metadata.has("inferred_sequences") || metadata.getJSONArray("inferred_sequences").length() == 0) { continue; } JSONArray inferredSequences = metadata.getJSONArray("inferred_sequences"); // get the first inferred sequence since it has the highest hmmer score JSONObject object = inferredSequences.getJSONObject(HIGHEST_SCORING_INFERRED_SEQ_INDEX); for (String blacklistWord : BLACKLISTED_WORDS_IN_INFERRED_SEQ) { if (object.getString("fasta_header").contains(blacklistWord)) { continue; } } dnaSeq = object.getString("sequence"); } proteinSeqs.add(dnaSeq); ProteinInformation proteinInformation = new ProteinInformation(sequenceInfo.getOrgName(), sequenceInfo.getEc(), sequenceInfo.getSequence(), reaction.getReactionName()); if (!proteinSeqToOrgInfo.containsKey(dnaSeq)) { proteinSeqToOrgInfo.put(dnaSeq, new HashSet<>()); } proteinSeqToOrgInfo.get(dnaSeq).add(proteinInformation); } } } } } if (proteinSeqs.size() == 0) { LOGGER.info("The reaction does not have any viable protein sequences"); atleastOneSeqMissingInPathway = true; break; } // Now we select two representative protein seqs from the reaction. In order to do this deterministically, // we sort and pick the first and middle index protein seqs. List<String> proteinSeqArray = new ArrayList<>(proteinSeqs); Collections.sort(proteinSeqArray); int firstIndex = 0; int middleIndex = proteinSeqs.size() / 2; // get first seq Set<String> combination = new HashSet<>(); combination.add(proteinSeqArray.get(firstIndex)); // get middle index of the protein seq array if (proteinSeqs.size() > 1) { combination.add(proteinSeqArray.get(middleIndex)); } proteinPaths.add(combination); } if (atleastOneSeqMissingInPathway) { LOGGER.info(String.format("There is at least one reaction with no sequence in reaction path id: %s", reactionPath.getId())); } else { LOGGER.info(String.format("All reactions in reaction path have at least one viable seq: %s", reactionPath.getId())); // We only compute the dna design if we can find at least one sequence for each reaction in the pathway. Set<List<String>> pathwayProteinCombinations = makePermutations(proteinPaths); Set<DNAOrgECNum> dnaDesigns = new HashSet<>(); for (List<String> proteinsInPathway : pathwayProteinCombinations) { try { Construct dna = p2d.computeDNA(proteinsInPathway, Host.Ecoli); List<Set<ProteinInformation>> seqMetadata = new ArrayList<>(); for (String protein : proteinsInPathway) { seqMetadata.add(proteinSeqToOrgInfo.get(protein)); } DNAOrgECNum instance = new DNAOrgECNum(dna.toSeq(), seqMetadata, proteinsInPathway.size()); dnaDesigns.add(instance); } catch (Exception ex) { LOGGER.error("The error thrown while trying to call computeDNA", ex.getMessage()); } } DNADesign dnaDesignSeq = new DNADesign(dnaDesigns); try { WriteResult<DNADesign, String> result = dnaDesignCollection.insert(dnaDesignSeq); String id = result.getSavedId(); reactionPath.setDnaDesignRef(id); } catch (MongoInternalException e) { // This condition happens whent the protein designs are too big and cannot be inserted in to the JSON object. // TODO: Handle this case without dropping the record LOGGER.error(String.format("Mongo internal exception caught while inserting dna design: %s", e.getMessage())); } } outputPathwayCollection.insert(reactionPath); } } private static Long lookupMolecule(MongoDB db, String someKey) { if (REGEX_ID.matcher(someKey).find()) { // Note: this doesn't verify that the chemical id is valid. Maybe we should do that? return Long.valueOf(someKey); } else if (REGEX_INCHI.matcher(someKey).find()) { Chemical c = db.getChemicalFromInChI(someKey); if (c != null) { return c.getUuid(); } } else if (REGEX_INCHI_KEY.matcher(someKey).find()) { Chemical c = db.getChemicalFromInChIKey(someKey); if (c != null) { return c.getUuid(); } } else { String msg = String.format("Unable to find key type for query '%s'", someKey); LOGGER.error(msg); throw new IllegalArgumentException(msg); } String msg = String.format("Unable to find matching chemical for query %s", someKey); LOGGER.error(msg); throw new IllegalArgumentException(msg); } }