/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package org.twentyn.proteintodna;
import act.server.MongoDB;
import act.shared.Chemical;
import act.shared.Reaction;
import act.shared.Seq;
import com.act.biointerpretation.metadata.ProteinMetadata;
import com.act.biointerpretation.metadata.RankPathway;
import com.act.reachables.Cascade;
import com.act.reachables.ReactionPath;
import com.act.utils.CLIUtil;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
import com.mongodb.MongoInternalException;
import com.mongodb.ServerAddress;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONObject;
import org.mongojack.DBCursor;
import org.mongojack.DBQuery;
import org.mongojack.JacksonDBCollection;
import org.mongojack.WriteResult;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class ProteinToDNADriver {
private static final Logger LOGGER = LogManager.getFormatterLogger(ProteinToDNADriver.class);
private static final String DEFAULT_DB_HOST = "localhost";
private static final String DEFAULT_DB_PORT = "27017";
private static final String DEFAULT_OUTPUT_DB_NAME = "wiki_reachables";
private static final String DEFAULT_INPUT_DB_NAME = "SHOULD_COME_FROM_CMDLINE"; // "jarvis_2016-12-09";
public static final String DEFAULT_INPUT_PATHWAY_COLLECTION_NAME = "SHOULD_COME_FROM_CMDLINE"; // "pathways_jarvis";
public static final String DEFAULT_OUTPUT_PATHWAY_COLLECTION_NAME = "SHOULD_COME_FROM_CMDLINE"; // "pathways_vijay";
public static final String DEFAULT_OUTPUT_DNA_SEQ_COLLECTION_NAME = "SHOULD_COME_FROM_CMDLINE"; // "dna_designs";
private static final String OPTION_DB_HOST = "H";
private static final String OPTION_DB_PORT = "p";
private static final String OPTION_OUTPUT_DB_NAME = "o";
private static final String OPTION_INPUT_DB_NAME = "i";
private static final String OPTION_INPUT_PATHWAY_COLLECTION_NAME = "c";
private static final String OPTION_OUTPUT_PATHWAY_COLLECTION_NAME = "d";
private static final String OPTION_OUTPUT_DNA_SEQ_COLLECTION_NAME = "e";
private static final String OPTION_DESIGN_SOME = "m";
private static final Integer HIGHEST_SCORING_INFERRED_SEQ_INDEX = 0;
private static final Set<String> BLACKLISTED_WORDS_IN_INFERRED_SEQ = new HashSet<>(Arrays.asList("Fragment"));
private static final Pattern REGEX_ID = Pattern.compile("^\\d+$");
private static final Pattern REGEX_INCHI = Pattern.compile("^InChI=1S?/");
// Based on https://en.wikipedia.org/wiki/International_Chemical_Identifier#InChIKey
private static final Pattern REGEX_INCHI_KEY = Pattern.compile("^[A-Z]{14}-[A-Z]{10}-[A-Z]$");
public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{
add(Option.builder(OPTION_DB_HOST)
.argName("hostname")
.desc(String.format("The DB host to which to connect (default: %s)", DEFAULT_DB_HOST))
.hasArg()
.longOpt("db-host")
);
add(Option.builder(OPTION_DB_PORT)
.argName("port")
.desc(String.format("The DB port to which to connect (default: %s)", DEFAULT_DB_PORT))
.hasArg()
.longOpt("db-port")
);
add(Option.builder(OPTION_OUTPUT_DB_NAME)
.argName("output-db-name")
.desc(String.format("The name of the database to read pathways and write seqs to (default: %s)", DEFAULT_OUTPUT_DB_NAME))
.hasArg()
.longOpt("output-db-name")
);
add(Option.builder(OPTION_INPUT_DB_NAME)
.argName("input-db-name")
.desc(String.format("The name of the database to read reactions from (default: %s)", DEFAULT_INPUT_DB_NAME))
.hasArg()
.required()
.longOpt("source-db-name")
);
add(Option.builder(OPTION_INPUT_PATHWAY_COLLECTION_NAME)
.argName("collection-name")
.desc(String.format("The name of the input pathway collection to read from (default: %s)", DEFAULT_INPUT_PATHWAY_COLLECTION_NAME))
.hasArg()
.required()
.longOpt("input-pathway-collection")
);
add(Option.builder(OPTION_OUTPUT_PATHWAY_COLLECTION_NAME)
.argName("collection-name")
.desc(String.format("The name of the output pathway collection to write to (default: %s)", DEFAULT_OUTPUT_PATHWAY_COLLECTION_NAME))
.hasArg()
.required()
.longOpt("output-pathway-collection")
);
add(Option.builder(OPTION_OUTPUT_DNA_SEQ_COLLECTION_NAME)
.argName("collection-name")
.desc(String.format("The name of the output dna seq collection to write to (default: %s)", DEFAULT_OUTPUT_DNA_SEQ_COLLECTION_NAME))
.hasArg()
.required()
.longOpt("output-dna-seq-collection")
);
add(Option.builder(OPTION_DESIGN_SOME)
.argName("molecule")
.desc("Generate designs for specified molecules; can be a numeric ids, InChIs, or InChI Keys, *separated by '|'* " +
"for InChI compatibility. Molecules must exist in chemical source DB if InChI or InChI Key is used.")
.hasArgs().valueSeparator('|')
.longOpt("design-this")
);
}};
public static final String HELP_MESSAGE =
"This class is the driver to extract protein sequences from pathways and construct DNA designs from these proteins.";
private static final CLIUtil CLI_UTIL = new CLIUtil(ProteinToDNADriver.class, HELP_MESSAGE, OPTION_BUILDERS);
/**
* This function gets all protein combinations of a pathway from candidate protein sequences from each reaction on
* the pathway.
* @param listOfSetOfProteinSequences A list of sets of candidate protein sequences in the pathway
* @return A set of all possible combinations of proteins from all the reactions in the pathway.
*/
public static Set<List<String>> makePermutations(List<Set<String>> listOfSetOfProteinSequences) {
Set<List<String>> accum = new HashSet<>();
makePermutationsHelper(accum, listOfSetOfProteinSequences, new ArrayList<>());
return accum;
}
private static void makePermutationsHelper(Set<List<String>> accum, List<Set<String>> input, List<String> prefix) {
// Base case: no more sequences to add. Accumulate and return.
if (input.isEmpty()) {
accum.add(prefix);
return;
}
// Recursive case: iterate through next level of input sequences, appending each to a prefix and recurring.
Set<String> head = input.get(0);
// Avoid index out of bounds exception.
List<Set<String>> rest = input.size() > 1 ? input.subList(1, input.size()) : Collections.emptyList();
for (String next : head) {
List<String> newPrefix = new ArrayList<>(prefix);
newPrefix.add(next);
makePermutationsHelper(accum, rest, newPrefix);
}
}
public static void main(String[] args) throws Exception {
CommandLine cl = CLI_UTIL.parseCommandLine(args);
String reactionDbName = cl.getOptionValue(OPTION_INPUT_DB_NAME, DEFAULT_INPUT_DB_NAME);
String dbHost = cl.getOptionValue(OPTION_DB_HOST, DEFAULT_DB_HOST);
Integer dbPort = Integer.valueOf(cl.getOptionValue(OPTION_DB_PORT, DEFAULT_DB_PORT));
MongoDB reactionDB = new MongoDB(dbHost, dbPort, reactionDbName);
MongoClient inputClient = new MongoClient(new ServerAddress(dbHost, dbPort));
String outDB = cl.getOptionValue(OPTION_OUTPUT_DB_NAME, DEFAULT_OUTPUT_DB_NAME);
DB db = inputClient.getDB(outDB);
String inputPathwaysCollectionName = cl.getOptionValue(OPTION_INPUT_PATHWAY_COLLECTION_NAME, DEFAULT_INPUT_PATHWAY_COLLECTION_NAME);
String outputPathwaysCollectionName = cl.getOptionValue(OPTION_OUTPUT_PATHWAY_COLLECTION_NAME, DEFAULT_OUTPUT_PATHWAY_COLLECTION_NAME);
String outputDnaDeqCollectionName = cl.getOptionValue(OPTION_OUTPUT_DNA_SEQ_COLLECTION_NAME, DEFAULT_OUTPUT_DNA_SEQ_COLLECTION_NAME);
JacksonDBCollection<ReactionPath, String> inputPathwayCollection = JacksonDBCollection.wrap(db.getCollection(inputPathwaysCollectionName), ReactionPath.class, String.class);
JacksonDBCollection<DNADesign, String> dnaDesignCollection = JacksonDBCollection.wrap(db.getCollection(outputDnaDeqCollectionName), DNADesign.class, String.class);
JacksonDBCollection<ReactionPath, String> outputPathwayCollection = JacksonDBCollection.wrap(db.getCollection(outputPathwaysCollectionName), ReactionPath.class, String.class);
Map<String, Set<ProteinInformation>> proteinSeqToOrgInfo = new HashMap<>();
ProteinsToDNA2 p2d = ProteinsToDNA2.initiate();
List<Long> reachableIds = cl.hasOption(OPTION_DESIGN_SOME) ?
Arrays.stream(cl.getOptionValues(OPTION_DESIGN_SOME)).
map(m -> lookupMolecule(reactionDB, m)).collect(Collectors.toList()) :
Collections.emptyList();
// If there is a list of molecules for which to create designs, only consider their pathways. Otherwise, find all.
DBObject query = reachableIds.isEmpty() ? new BasicDBObject() : DBQuery.in("target", reachableIds);
/* Extract all pathway ids, then read pathways one id at a time. This will reduce the likelihood of the cursor
* timing out, which has a tendency to happen when doing expensive operations before advancing (as done here). */
DBCursor<ReactionPath> cursor = inputPathwayCollection.find(query, new BasicDBObject("_id", true));
List<String> pathwayIds = new ArrayList<>();
while (cursor.hasNext()) {
pathwayIds.add(cursor.next().getId());
}
for (String pathwayId : pathwayIds) {
ReactionPath reactionPath = inputPathwayCollection.findOne(DBQuery.is("_id", pathwayId));
List<List<Pair<ProteinMetadata, Integer>>> processedP = RankPathway.processSinglePathAsJava(reactionPath, reactionDbName, outDB);
if (processedP == null) {
LOGGER.info(String.format("Process pathway was filtered out possibly because there were more than %s seqs for a given pathway",
RankPathway.MAX_PROTEINS_PER_PATH()));
outputPathwayCollection.insert(reactionPath);
continue;
}
Boolean atleastOneSeqMissingInPathway = false;
List<Set<String>> proteinPaths = new ArrayList<>();
for (Cascade.NodeInformation nodeInformation :
reactionPath.getPath().stream().filter(nodeInfo -> nodeInfo.getIsReaction()).collect(Collectors.toList())) {
Set<String> proteinSeqs = new HashSet<>();
for (Long id : nodeInformation.getReactionIds()) {
// If the id is negative, it is a reaction in the reverse direction. Moreover, the enzyme for this reverse
// reaction is the same, so can use the actual positive reaction id's protein seq reference.
// TODO: Add a preference for the positive forward direction compared to the negative backward direction seq.
if (id < 0) {
LOGGER.info("Found a negative reaction id", id);
id = Reaction.reverseID(id);
}
Reaction reaction = reactionDB.getReactionFromUUID(id);
for (JSONObject data : reaction.getProteinData()) {
// Get the sequences
if (data.has("sequences")) {
JSONArray seqs = data.getJSONArray("sequences");
for (int i = 0; i < seqs.length(); i++) {
Long s = seqs.getLong(i);
if (s != null) {
Seq sequenceInfo = reactionDB.getSeqFromID(s);
String dnaSeq = sequenceInfo.getSequence();
if (dnaSeq == null) {
LOGGER.info(String.format("Sequence string for seq id %d, reaction id %d and reaction path %s is null",
s, id, reactionPath.getId()));
continue;
}
// odd sequence
if (dnaSeq.length() <= 80 || dnaSeq.charAt(0) != 'M') {
JSONObject metadata = sequenceInfo.getMetadata();
if (!metadata.has("inferred_sequences") || metadata.getJSONArray("inferred_sequences").length() == 0) {
continue;
}
JSONArray inferredSequences = metadata.getJSONArray("inferred_sequences");
// get the first inferred sequence since it has the highest hmmer score
JSONObject object = inferredSequences.getJSONObject(HIGHEST_SCORING_INFERRED_SEQ_INDEX);
for (String blacklistWord : BLACKLISTED_WORDS_IN_INFERRED_SEQ) {
if (object.getString("fasta_header").contains(blacklistWord)) {
continue;
}
}
dnaSeq = object.getString("sequence");
}
proteinSeqs.add(dnaSeq);
ProteinInformation proteinInformation = new ProteinInformation(sequenceInfo.getOrgName(), sequenceInfo.getEc(),
sequenceInfo.getSequence(), reaction.getReactionName());
if (!proteinSeqToOrgInfo.containsKey(dnaSeq)) {
proteinSeqToOrgInfo.put(dnaSeq, new HashSet<>());
}
proteinSeqToOrgInfo.get(dnaSeq).add(proteinInformation);
}
}
}
}
}
if (proteinSeqs.size() == 0) {
LOGGER.info("The reaction does not have any viable protein sequences");
atleastOneSeqMissingInPathway = true;
break;
}
// Now we select two representative protein seqs from the reaction. In order to do this deterministically,
// we sort and pick the first and middle index protein seqs.
List<String> proteinSeqArray = new ArrayList<>(proteinSeqs);
Collections.sort(proteinSeqArray);
int firstIndex = 0;
int middleIndex = proteinSeqs.size() / 2;
// get first seq
Set<String> combination = new HashSet<>();
combination.add(proteinSeqArray.get(firstIndex));
// get middle index of the protein seq array
if (proteinSeqs.size() > 1) {
combination.add(proteinSeqArray.get(middleIndex));
}
proteinPaths.add(combination);
}
if (atleastOneSeqMissingInPathway) {
LOGGER.info(String.format("There is at least one reaction with no sequence in reaction path id: %s", reactionPath.getId()));
} else {
LOGGER.info(String.format("All reactions in reaction path have at least one viable seq: %s", reactionPath.getId()));
// We only compute the dna design if we can find at least one sequence for each reaction in the pathway.
Set<List<String>> pathwayProteinCombinations = makePermutations(proteinPaths);
Set<DNAOrgECNum> dnaDesigns = new HashSet<>();
for (List<String> proteinsInPathway : pathwayProteinCombinations) {
try {
Construct dna = p2d.computeDNA(proteinsInPathway, Host.Ecoli);
List<Set<ProteinInformation>> seqMetadata = new ArrayList<>();
for (String protein : proteinsInPathway) {
seqMetadata.add(proteinSeqToOrgInfo.get(protein));
}
DNAOrgECNum instance = new DNAOrgECNum(dna.toSeq(), seqMetadata, proteinsInPathway.size());
dnaDesigns.add(instance);
} catch (Exception ex) {
LOGGER.error("The error thrown while trying to call computeDNA", ex.getMessage());
}
}
DNADesign dnaDesignSeq = new DNADesign(dnaDesigns);
try {
WriteResult<DNADesign, String> result = dnaDesignCollection.insert(dnaDesignSeq);
String id = result.getSavedId();
reactionPath.setDnaDesignRef(id);
} catch (MongoInternalException e) {
// This condition happens whent the protein designs are too big and cannot be inserted in to the JSON object.
// TODO: Handle this case without dropping the record
LOGGER.error(String.format("Mongo internal exception caught while inserting dna design: %s", e.getMessage()));
}
}
outputPathwayCollection.insert(reactionPath);
}
}
private static Long lookupMolecule(MongoDB db, String someKey) {
if (REGEX_ID.matcher(someKey).find()) {
// Note: this doesn't verify that the chemical id is valid. Maybe we should do that?
return Long.valueOf(someKey);
} else if (REGEX_INCHI.matcher(someKey).find()) {
Chemical c = db.getChemicalFromInChI(someKey);
if (c != null) {
return c.getUuid();
}
} else if (REGEX_INCHI_KEY.matcher(someKey).find()) {
Chemical c = db.getChemicalFromInChIKey(someKey);
if (c != null) {
return c.getUuid();
}
} else {
String msg = String.format("Unable to find key type for query '%s'", someKey);
LOGGER.error(msg);
throw new IllegalArgumentException(msg);
}
String msg = String.format("Unable to find matching chemical for query %s", someKey);
LOGGER.error(msg);
throw new IllegalArgumentException(msg);
}
}