/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.reachables;
import act.installer.bing.BingSearchRanker;
import act.server.NoSQLAPI;
import act.shared.Reaction;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class ConditionalReachabilityInterpreter {
private static final String GLUCOSE_INCHI = "InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1";
private static final String ATP_INCHI = "InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(17)6(16)4(26-10)1-25-30(21,22)28-31(23,24)27-29(18,19)20/h2-4,6-7,10,16-17H,1H2,(H,21,22)(H,23,24)(H2,11,12,13)(H2,18,19,20)/t4-,6-,7-,10-/m1/s1";
private static final Set<String> BLACKLISTED_ROOT_INCHIS = new HashSet<String>() {{
add(GLUCOSE_INCHI);
add(ATP_INCHI);
}};
public static final String OPTION_OUTPUT_FILEPATH = "o";
public static final String OPTION_INPUT_ACT_FILEPATH = "i";
public static final String OPTION_DB_NAME = "d";
private static final Logger LOGGER = LogManager.getFormatterLogger(ConditionalReachabilityInterpreter.class);
public static final String HELP_MESSAGE = StringUtils.join(new String[]{
"This class is used to deserialize a reachable forest and output bing search results of all chemicals within each root",
"of the forest along with it's root associate."
}, " ");
public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{
add(Option.builder(OPTION_OUTPUT_FILEPATH)
.argName("OUTPUT_FILEPATH")
.desc("The full path to the output file")
.hasArg()
.required()
.longOpt("output_filepath")
.type(String.class)
);
add(Option.builder(OPTION_INPUT_ACT_FILEPATH)
.argName("INPUT_ACT_FILEPATH")
.desc("The full path to the input act file")
.hasArg()
.required()
.longOpt("input_act_filepath")
.type(String.class)
);
add(Option.builder(OPTION_DB_NAME)
.argName("DB_NAME")
.desc("The name of the database")
.hasArg()
.required()
.longOpt("db_name")
.type(String.class)
);
add(Option.builder("h")
.argName("help")
.desc("Prints this help message")
.longOpt("help")
);
}};
public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();
static {
HELP_FORMATTER.setWidth(100);
}
// Instance variables
private ActData actData;
private Set<Long> rootChemicals;
private Map<Long, String> chemIdToInchi;
private Map<String, Integer> depthOfMolecule;
private NoSQLAPI db = new NoSQLAPI("marvin", "marvin");
public ConditionalReachabilityInterpreter(ActData actData, NoSQLAPI db) {
this.actData = actData;
this.rootChemicals = new HashSet<>();
this.chemIdToInchi = new HashMap<>();
this.depthOfMolecule = new HashMap<>();
this.db = db;
}
public static void main(String[] args) throws Exception {
// Parse the command line options
Options opts = new Options();
for (Option.Builder b : OPTION_BUILDERS) {
opts.addOption(b.build());
}
CommandLine cl = null;
try {
CommandLineParser parser = new DefaultParser();
cl = parser.parse(opts, args);
} catch (ParseException e) {
System.err.format("Argument parsing failed: %s\n", e.getMessage());
HELP_FORMATTER.printHelp(BingSearchRanker.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
System.exit(1);
}
if (cl.hasOption("help")) {
HELP_FORMATTER.printHelp(BingSearchRanker.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
return;
}
String inputPath = cl.getOptionValue(OPTION_INPUT_ACT_FILEPATH);
String outputPath = cl.getOptionValue(OPTION_OUTPUT_FILEPATH);
String dbName = cl.getOptionValue(OPTION_DB_NAME);
LOGGER.info("Starting to deserialize reachables forest.");
ActData.instance().deserialize(inputPath);
ActData actData = ActData.instance();
LOGGER.info("Finished deserializing reachables forest.");
NoSQLAPI db = new NoSQLAPI(dbName, dbName);
ConditionalReachabilityInterpreter conditionalReachabilityInterpreter =
new ConditionalReachabilityInterpreter(actData, db);
conditionalReachabilityInterpreter.run(outputPath);
}
/**
* This function constructs parent to children associations, while finding root chemicals from the reachables forest.
* @return parent to child associations
*/
private Map<Long, Set<Long>> constructParentToChildAssociationsAndPopulateRootChemicals() {
Map<Long, Set<Long>> parentToChildrenAssociations = new HashMap<>();
for (Map.Entry<Long, Long> childIdToParentId : this.actData.getActTree().parents.entrySet()) {
Long parentId = childIdToParentId.getValue();
Long childId = childIdToParentId.getKey();
// If the parentId is null, that means the node is one of the roots of the forest.
if (parentId == null) {
rootChemicals.add(childId);
continue;
}
Set<Long> childIds = parentToChildrenAssociations.get(parentId);
if (childIds == null) {
childIds = new HashSet<>();
parentToChildrenAssociations.put(parentId, childIds);
}
childIds.add(childId);
}
return parentToChildrenAssociations;
}
/**
* This function constructs root to descendant mappings, creating a representation of the forest that is easy to
* traverse.
* @param parentToDescendantsAssociations A mapping between parent id to a set of all it's children.
* @return a mapping of root id to all its descendant ids.
*/
private Map<Long, Set<Long>> constructRootToDescendantMappings(Map<Long, Set<Long>> parentToDescendantsAssociations) {
Map<Long, Set<Long>> rootToSetOfDescendants = new HashMap<>();
for (Long rootId : rootChemicals) {
// Record depth of each tree
int depth = 1;
String rootInchi = db.readChemicalFromInKnowledgeGraph(rootId < 0 ? Reaction.reverseNegativeId(rootId) : rootId).getInChI();
chemIdToInchi.put(rootId, rootInchi);
Set<Long> children = parentToDescendantsAssociations.get(rootId);
while (children != null && children.size() > 0) {
Set<Long> descendants = rootToSetOfDescendants.get(rootId);
if (descendants == null) {
descendants = new HashSet<>();
rootToSetOfDescendants.put(rootId, descendants);
}
descendants.addAll(children);
/**
* Record depth for each member of children and construct a Set newChildren which is the set of all children
* of the variable children.
*/
Set<Long> newChildren = new HashSet<>();
for (Long child : children) {
String childInchi = chemIdToInchi.get(child);
if (childInchi == null) {
childInchi = db.readChemicalFromInKnowledgeGraph(child < 0 ? Reaction.reverseNegativeId(child) : child).getInChI();
chemIdToInchi.put(child, childInchi);
}
// Since a child is only associated with one parent, we can simply record it's depth from that root without
// worrying about possible collisions with other roots as parents as none exist.
depthOfMolecule.put(childInchi, depth);
Set<Long> childrenOfChil = parentToDescendantsAssociations.get(child);
if (childrenOfChil != null) {
newChildren.addAll(childrenOfChil);
}
}
children = newChildren;
depth++;
}
}
return rootToSetOfDescendants;
}
/**
* This function constructs the conditional reachability forest, from each root to its descendants, and passes that
* structure to the bing search results of chemical ranking. Based on the ranking, we output a tsv file for each
* molecule that is conditionally reachable, its root and bing search metadata.
* @param outputFilePath The output file to write to
* @throws IOException
*/
private void run(String outputFilePath) throws IOException {
LOGGER.info("Create parent to child associations");
Map<Long, Set<Long>> parentToChildrenAssociations = constructParentToChildAssociationsAndPopulateRootChemicals();
LOGGER.info("Construct trees from the root chemicals");
Map<Long, Set<Long>> rootToSetOfDescendants = constructRootToDescendantMappings(parentToChildrenAssociations);
LOGGER.info("Construct reverse mapping from descendant to root chemical");
Map<String, String> descendantInchiToRootInchi = new HashMap<>();
for (Map.Entry<Long, Set<Long>> entry : rootToSetOfDescendants.entrySet()) {
String rootInchi = chemIdToInchi.get(entry.getKey());
if (BLACKLISTED_ROOT_INCHIS.contains(rootInchi)) {
continue;
}
for (Long descendant : entry.getValue()) {
// Since a chemical is only added as a child to one specific root, there is not chance for collisions to happen.
descendantInchiToRootInchi.put(chemIdToInchi.get(descendant), rootInchi);
}
}
Set<String> allInchis = new HashSet<>();
allInchis.addAll(chemIdToInchi.values());
LOGGER.info("Add chemicals to bing search results");
// Update the Bing Search results in the Installer database
BingSearchRanker bingSearchRanker = new BingSearchRanker();
bingSearchRanker.addBingSearchResults(allInchis);
LOGGER.info("Write chemicals to output file");
bingSearchRanker.writeBingSearchRanksAsTSVUsingConditionalReachabilityFormat(
allInchis,
descendantInchiToRootInchi,
depthOfMolecule,
outputFilePath);
}
}