/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.reachables;
import act.server.DBIterator;
import act.server.MongoDB;
import act.shared.Chemical;
import act.shared.Chemical.REFS;
import act.shared.Reaction;
import com.mongodb.BasicDBObject;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.json.JSONArray;
import org.json.JSONObject;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class LoadAct extends SteppedTask {
// Constants
private static String _fileloc = "com.act.reachables.LoadAct";
private static final boolean SET_METADATA_ON_NW_NODES = false;
private static final String DEFAULT_DB_HOST = "localhost";
private static final int DEFAULT_PORT = 27017;
// Fields
private MongoDB db;
private Set<String> optional_universal_inchis;
private Set<String> optional_cofactor_inchis;
private int loaded, total;
private List<String> fieldSetForChemicals;
private LoadAct(String dbToUse, Set<String> optional_universal_inchis, Set<String> optional_cofactor_inchis) {
this.optional_universal_inchis = optional_universal_inchis;
this.optional_cofactor_inchis = optional_cofactor_inchis;
this.fieldSetForChemicals = new ArrayList<>();
this.db = new MongoDB(DEFAULT_DB_HOST, DEFAULT_PORT, dbToUse);
ActData.instance().Act = new Network("Act");
ActData.instance().ActTree = new Network("Act Tree");
// the following take time, so will be done in init();
ActData.instance().allrxnids = null;
ActData.instance().cofactors = null;
ActData.instance().natives = null;
ActData.instance().chemInchis = null;
ActData.instance().chemId2Inchis = null;
ActData.instance().chemId2ReadableName = null;
ActData.instance().chemToxicity = null;
ActData.instance().chemsReferencedInRxns = null;
GlobalParams._hostOrganismIDs = new Long[GlobalParams._hostOrganisms.length];
for (int i = 0; i<GlobalParams._hostOrganisms.length; i++) {
GlobalParams._hostOrganismIDs[i] = this.db.getOrganismId(GlobalParams._hostOrganisms[i]);
}
total = -1;
loaded = -1;
}
public static Network getReachablesTree(String dbToUse, Set<String> natives, Set<String> cofactors, boolean restrictToSeq, String[] extra_chem_fields) {
GlobalParams._actTreeOnlyIncludeRxnsWithSequences = restrictToSeq;
// init loader
LoadAct act = new LoadAct(dbToUse, natives, cofactors);
// set fields to include in the tree even if they are not reachables
if (extra_chem_fields != null)
for (String f : extra_chem_fields)
act.setFieldForExtraChemicals(f);
// load data from db; and compute reachables tree
act.run();
return ActData.instance().ActTree;
}
public static Network getReachablesTree(String dbToUse, Set<String> natives, Set<String> cofactors, boolean restrictToSeq) {
return getReachablesTree(dbToUse, natives, cofactors, restrictToSeq, null);
}
public static Network getReachablesTree(String dbToUse, Set<String> natives, Set<String> cofactors) {
return getReachablesTree(dbToUse, natives, cofactors, true);
}
public static String toInChI(Long id) {
return ActData.instance().chemId2Inchis.get(id);
}
private static void addToNw(Reaction rxn) {
/**
* This class takes in a reaction and adds it to ActData.
*
* Special handling is afforded to abstract reactions and reactions w/ only cofactors as substrates.
*/
Long rxnid = (long) rxn.getUUID();
// Filter out abstract substrates. These reactions are invalid
Long[] inputChemicalsArray = ArrayUtils.addAll(rxn.getSubstrates(), rxn.getSubstrateCofactors());
// Some reactions have coenzymes, so don't forget about those.
inputChemicalsArray = ArrayUtils.addAll(inputChemicalsArray, rxn.getCoenzymes());
if (Arrays.stream(inputChemicalsArray).anyMatch(x -> ActData.instance().metaCycBigMolsOrRgrp.contains(x))){
logProgress(String.format("Skipping reaction %d as it contains an abstract substrate.", rxnid));
return;
}
// Remove cofactors from the needed substrates.
List<Long> inputChemicals = Arrays.asList(inputChemicalsArray).stream()
.filter(c -> !ActData.instance().cofactors.contains(c))
.collect(Collectors.toList());
Long[] outputChemicalsArray = ArrayUtils.addAll(rxn.getProducts(), rxn.getProductCofactors());
// Output chemicals, only real ones. Also remove all of the cofactors as we don't care if those are produced.
List<Long> outputChemicals = Arrays.asList(outputChemicalsArray).stream()
.filter(p -> !ActData.instance().metaCycBigMolsOrRgrp.contains(p) && !ActData.instance().cofactors.contains(p))
.collect(Collectors.toList());
// Don't bother if we don't get any new chemicals from this.
if (outputChemicals.isEmpty()){
logProgress(String.format("Skipping reaction %d as it has no non-abstract products.", rxnid));
return;
}
if (inputChemicals.isEmpty()) {
logProgress(String.format("Undoing filtering on reaction %d substrates as it has only cofactor substrates.", rxnid));
// These will be substrate only
inputChemicals = Arrays.asList(inputChemicalsArray);
}
// Load all the nodeMapping.
inputChemicals.stream().distinct().forEach(s -> {
Node sub = Node.get(s, true);
ActData.instance().chemsInAct.put(s, sub);
ActData.instance().Act.addNode(sub, s);
outputChemicals.stream().distinct().forEach(p -> {
Node prd = Node.get(p, true);
ActData.instance().Act.addNode(prd, p);
ActData.instance().chemsInAct.put(p, prd);
Edge r = Edge.get(sub, prd, true);
ActData.instance().Act.addEdge(r);
});
});
Set<Long> inputSet = new HashSet<>(inputChemicals);
Set<Long> outputSet = new HashSet<>(outputChemicals);
/* --------------- Update ActData ------------------ */
ActData.instance().rxnSubstrates.put(rxnid, inputSet);
ActData.instance().rxnProducts.put(rxnid, outputSet);
inputSet.stream().forEach(ActData.instance().chemsReferencedInRxns::add);
outputSet.stream().forEach(ActData.instance().chemsReferencedInRxns::add);
inputSet.stream().forEach(s -> {
Set<Long> pSet = ActData.instance().rxnsThatConsumeChem.getOrDefault(s, new HashSet<>());
pSet.add(rxnid);
ActData.instance().rxnsThatConsumeChem.put(s, pSet);
});
outputSet.stream().forEach(p -> {
Set<Long> pSet = ActData.instance().rxnsThatProduceChem.getOrDefault(p, new HashSet<>());
pSet.add(rxnid);
ActData.instance().rxnsThatProduceChem.put(p, pSet);
});
// add to internal copy of network
ActData.instance().rxnHasSeq.put(rxnid, rxn.hasProteinSeq());
/* -------- Reaction Classes ------- */
Pair<Set<Long>, Set<Long>> rxnClass = Pair.of(inputSet, outputSet);
if (!ActData.instance().rxnClasses.contains(rxnClass)) {
// the first reaction that shows up in this class, get to
// represent the entire class. So we install it in the
// datasets mirroring the non-class structures...
ActData.instance().rxnClassesSubstrates.put(rxnid, inputSet);
ActData.instance().rxnClassesProducts.put(rxnid, outputSet);
ActData.instance().rxnClasses.add(rxnClass);
}
}
public static void annotateRxnEdges(Reaction rxn, HashSet<Edge> rxn_edges) {
for (Edge e : rxn_edges) {
Edge.setAttribute(e, "isRxn", true);
Edge.setAttribute(e, "datasource", rxn.getDataSource());
Edge.setAttribute(e, "srcRxnID", rxn.getUUID());
Edge.setAttribute(e, "srcRxn", rxn.getReactionName());
if (rxn.getECNum() != null)
Edge.setAttribute(e, "srcRxnEC", rxn.getECNum());
}
}
public static boolean isCofactor(long m) {
return ActData.instance().cofactors.contains(m);
}
private static void logProgress(String format, Object... args) {
if (!GlobalParams.LOG_PROGRESS)
return;
System.err.format(_fileloc + ": " + format, args);
}
private static void logProgress(String msg) {
if (!GlobalParams.LOG_PROGRESS)
return;
System.err.println(_fileloc + ": " + msg);
}
private List<Long> getAllIDsSorted() {
List<Long> allids = this.db.getAllReactionUUIDs();
Collections.sort(allids);
return allids;
}
private Set<Long> getNatives() {
Set<Long> natives_ids = new HashSet<>();
if (this.optional_universal_inchis == null) {
// pull whatever is in the DB
List<Chemical> cs = this.db.getNativeMetaboliteChems();
for (Chemical c : cs)
natives_ids.add(c.getUuid());
} else {
// use the inchis provided to the constructor
for (String inchi : this.optional_universal_inchis) {
Chemical c = this.db.getChemicalFromInChI(inchi);
if (c == null) {
logProgress("LoadAct: WARNING: Starting native not in db.");
logProgress("LoadAct: : InChI = " + inchi);
continue;
}
natives_ids.add(c.getUuid());
}
}
logProgress(String.format("%d natives loaded into Act",natives_ids.size()));
return natives_ids;
}
private Set<Long> getCofactors() {
Set<Long> cofactor_ids = new HashSet<>();
if (this.optional_cofactor_inchis == null) {
// pull whatever is in the DB
List<Chemical> cs = this.db.getCofactorChemicals();
// TODO Add filter for chemicals w/o inchis
for (Chemical c : cs)
if (c.getInChI() != null) {
cofactor_ids.add(c.getUuid());
}
} else {
// use the inchis provided to the constructor
for (String inchi : this.optional_cofactor_inchis) {
Chemical c = this.db.getChemicalFromInChI(inchi);
if (c == null) {
logProgress("LoadAct: SEVERE WARNING: Starting cofactor not in db.");
logProgress("LoadAct: : InChI = " + inchi);
continue;
}
cofactor_ids.add(c.getUuid());
}
}
return cofactor_ids;
}
public void setFieldForExtraChemicals(String f) {
this.fieldSetForChemicals.add(f);
}
private HashMap<String, List<Long>> getChemicalWithUserSpecFields() {
HashMap<String, List<Long>> specials = new HashMap<>();
for (String f : this.fieldSetForChemicals) {
List<Chemical> cs = this.db.getChemicalsThatHaveField(f);
specials.put(f, extractChemicalIDs(cs));
}
return specials;
}
private Set<Long> getMetaCycBigMolsOrRgrp() {
HashSet<Long> ids = new HashSet<>();
DBIterator chemCursor = this.db.getIdCursorForFakeChemicals();
while (chemCursor.hasNext()) {
ids.add((Long) chemCursor.next().get("_id"));
}
chemCursor.close();
return ids;
}
private List<Long> extractChemicalIDs(List<Chemical> cs) {
List<Long> cids = new ArrayList<Long>();
for (Chemical c : cs)
cids.add(c.getUuid());
return cids;
}
private void addReactionsToNetwork() {
// Ignore reactions with "protein" in their description. This is only necessary because there are some problems in
// data parsing where we don't always put the proteins into the data, so sometimes fake data looks real.
BasicDBObject noFakeReactions = new BasicDBObject("easy_desc", new BasicDBObject("$regex", "^((?!protein).)*$"));
DBIterator iterator = this.db.getIteratorOverReactions(noFakeReactions, null);
Reaction r;
Map<Reaction.RxnDataSource, Integer> counts = new HashMap<>();
for (Reaction.RxnDataSource src : Reaction.RxnDataSource.values())
counts.put(src, 0);
// since we are iterating until the end,
// the getNextReaction call will close the DB cursor...
while ((r = this.db.getNextReaction(iterator)) != null) {
// this rxn comes from a datasource, METACYC, BRENDA or KEGG.
// ensure the configuration tells us to include this datasource...
Reaction.RxnDataSource src = r.getDataSource();
Set<Reaction> reactionsWithAccurateDirections = r.correctForReactionDirection();
counts.put(src, counts.get(src) + reactionsWithAccurateDirections.size());
// Correct for right-to-left and reversible actions, adding all appropriate directions to the graph.
for (Reaction directedRxn : reactionsWithAccurateDirections) {
addToNw(directedRxn);
}
}
logProgress("");
logProgress("Rxn aggregate into %d classes.\n", ActData.instance().rxnClasses.size());
}
@Override
public double percentDone() {
return 100.0 * ((double) this.loaded / this.total);
}
@Override
public void doMoreWork() {
logProgress("Pulling %d reactions from MongoDB:\n", this.total);
addReactionsToNetwork();
this.loaded = this.total;
}
@Override
public void init() {
ActData.instance().allrxnids = getAllIDsSorted();
total = ActData.instance().allrxnids.size();
loaded = 0;
ActData.instance().cofactors = getCofactors();
ActData.instance().natives = getNatives();
ActData.instance().chemicalsWithUserField = getChemicalWithUserSpecFields();
ActData.instance().chemicalsWithUserField_treeOrganic = new HashSet<>();
ActData.instance().chemicalsWithUserField_treeArtificial = new HashSet<>();
ActData.instance().metaCycBigMolsOrRgrp = getMetaCycBigMolsOrRgrp();
ActData.instance().chemsReferencedInRxns = new HashSet<>();
ActData.instance().chemsInAct = new HashMap<>();
ActData.instance().rxnsInAct = new HashMap<>();
ActData.instance().rxnSubstrates = new HashMap<>();
ActData.instance().rxnsThatConsumeChem = new HashMap<>();
ActData.instance().rxnsThatProduceChem = new HashMap<>();
ActData.instance().rxnProducts = new HashMap<>();
ActData.instance().rxnOrganisms = new HashMap<>();
ActData.instance().rxnSubstratesCofactors = new HashMap<>();
ActData.instance().rxnProductsCofactors = new HashMap<>();
ActData.instance().rxnHasSeq = new HashMap<>();
ActData.instance().rxnClassesSubstrates = new HashMap<>();
ActData.instance().rxnClassesProducts = new HashMap<>();
ActData.instance().rxnClasses = new HashSet<>();
ActData.instance().rxnClassesThatConsumeChem = new HashMap<>();
ActData.instance().rxnClassesThatProduceChem = new HashMap<>();
ActData.instance().noSubstrateRxnsToProducts = new HashMap<>();
logProgress("Initialization complete.");
}
@Override
public void finalize(TaskMonitor tm) {
logProgress("ComputeReachablesTree.. starting");
ActData.instance().chemToxicity = new HashMap<Long, Set<Integer>>();
ActData.instance().chemInchis = new HashMap<String, Long>();
ActData.instance().chemId2Inchis = new HashMap<Long, String>();
ActData.instance().chemId2ReadableName = new HashMap<Long, String>();
ActData.instance().chemIdIsAbstraction = new HashMap<Long, Boolean>();
pullChemicalsReferencedInRxns();
ActData.instance().chemsReferencedInRxns.addAll(ActData.instance().cofactors);
ActData.instance().chemsReferencedInRxns.addAll(ActData.instance().natives);
for (String f : ActData.instance().chemicalsWithUserField.keySet())
ActData.instance().chemsReferencedInRxns.addAll(ActData.instance().chemicalsWithUserField.get(f));
// computes reachables tree and writes it into ActData.instance().ActTree
new ComputeReachablesTree(this.db);
}
private void pullChemicalsReferencedInRxns() {
int N = ActData.instance().chemsReferencedInRxns.size();
int count = 0;
logProgress("Extracting metadata from chemicals.");
for (Long id : ActData.instance().chemsReferencedInRxns) {
logProgress("\t pullChemicalsReferencedInRxns: %d\r", count++);
Chemical c = this.db.getChemicalFromChemicalUUID(id);
ActData.instance().chemInchis.put(c.getInChI(), id);
ActData.instance().chemId2Inchis.put(id, c.getInChI());
ActData.instance().chemIdIsAbstraction.put(id, isAbstractInChI(c.getInChI()));
String name = c.getShortestBRENDAName();
if (name == null) {
// see if there is a metacyc name:
Object meta = c.getRef(Chemical.REFS.METACYC, new String[] { "meta" });
if (meta != null) {
// if we are here, the entry was referenced in metacyc, so must
// have some name association there. see if we can pull that out.
// the xref.METACYC.meta field should *always* be a JSONArray
// (even if its an array with a single JSONObject within it)
// sanity check that, and abort if the type does not match
if (!(meta instanceof JSONArray)) {
throw new RuntimeException("Expect only Arrays in db.chemicals.{xref.METACYC.meta}, but found: " + meta.getClass());
}
name = ((JSONObject) ((JSONArray)meta).get(0) ).getString("sname");
// if failed to pull out a name from metacyc, report it
if (name == null)
System.out.println("ERROR: Looks like a metacyc entry chemical, but no metacyc name: " + id);
}
}
if (name == null) {
// Try to use wikipedia
Object meta = c.getRef(Chemical.REFS.WIKIPEDIA, new String[] { "metadata" });
if (meta != null) {
if (!(meta instanceof JSONObject)) {
throw new RuntimeException("Unable to parse Wikipedia metadata.");
}
name = (String) ((JSONObject) meta).get("article");
}
}
if (name == null) {
// stuff the inchi into the name
name = c.getInChI();
}
ActData.instance().chemId2ReadableName.put(id, name);
if (SET_METADATA_ON_NW_NODES) {
String[] xpath = { "metadata", "toxicity" };
Object o = c.getRef(REFS.DRUGBANK, xpath);
if (o == null || !(o instanceof String))
continue;
Set<Integer> ld50s = extractLD50vals((String)o);
ActData.instance().chemToxicity.put(id, ld50s);
// set chemical attributes
String txt = null; // D MongoDB.chemicalAsString(c, id);
Set<Integer> tox = ActData.instance().chemToxicity.get(id);
Long n1 = ActData.instance().chemsInAct.get(id).getIdentifier();
int fanout = ActData.instance().rxnsThatConsumeChem.containsKey(id) ? ActData.instance().rxnsThatConsumeChem.get(id).size() : -1;
int fanin = ActData.instance().rxnsThatProduceChem.containsKey(id) ? ActData.instance().rxnsThatProduceChem.get(id).size() : -1;
setMetadata(n1, tox, c, txt, fanout, fanin);
}
}
logProgress("");
}
private boolean isAbstractInChI(String inchi) {
// Create a Pattern object
Pattern r = Pattern.compile("^InChI=1S\\/[A-Z0-9]*R");
return r.matcher(inchi).find();
}
private Set<Integer> extractLD50vals(String annotation) {
// an example of what we want to process is: "Oral, mouse: LD50 = 338 mg/kg; Oral, rat: LD50 = 1944 mg/kg"
// a second example of what to process is: "Acute oral toxicity (LD<sub>50</sub>) in rats is 264 mg/kg."
// a third example of what to process is : "Oral mouse and rat LD<sub>50</sub> are 338 mg/kg and 425 mg/kg respectively"
// a fourth example of what to process is: "oral LD<sub>50</sub>s were 1,100-1,550 mg/kg; 1,450 mg/kg; and 1,490 mg/kg; respectively"
// most of time it is specified as mg/kg but rarely we also have g/kg: "Oral LD50 in rat: >5 g/kg"
Set<Integer> ld50s = new HashSet<Integer>();
int idx = 0;
int len = annotation.length();
String[] locator = { "LD50", "LD<sub>50</sub>" };
int[] locator_len = { locator[0].length(), locator[1].length() };
int delta = 60; // 60 chars +- the locator
for (int l = 0; l<locator.length; l++) {
while ((idx = annotation.indexOf(locator[l], idx)) != -1) {
// look around a few tokens to find numbers that we can use...
int low = idx - delta < 0 ? 0 : idx - delta;
int high = idx + delta > len ? len : idx + delta;
String sub = annotation.substring(low, idx) + annotation.substring(idx + locator_len[l], high);
Scanner scan = new Scanner(sub).useDelimiter("[^0-9,]+");
while (scan.hasNext()) {
String scanned = scan.next().replaceAll(",", "");
try { ld50s.add(Integer.parseInt(scanned)); }
catch (NumberFormatException e) { }
}
idx++; // so that we skip the occurrence that we just added
}
}
return ld50s;
}
private void setMetadata(Long n, Set<Integer> tox, Chemical c, String fulltxt, int fanout, int fanin) {
Node.setAttribute(n, "isNative", c.isNative());
Node.setAttribute(n, "fanout", fanout);
Node.setAttribute(n, "fanin", fanin);
if (c.getCanon() != null) Node.setAttribute(n, "canonical", c.getCanon());
if (c.getInChI() != null) Node.setAttribute(n, "InChI", c.getInChI());
if (c.getSmiles() != null) Node.setAttribute(n, "SMILES", c.getSmiles());
if (c.getShortestName() != null) Node.setAttribute(n, "Name", c.getShortestName());
if (c.getBrendaNames() != null && c.getSynonyms() != null) Node.setAttribute(n, "Synonyms", c.getBrendaNames().toString() + c.getSynonyms().toString());
setXrefs(n, c);
if (fulltxt != null) Node.setAttribute(n, "fulltxt", fulltxt);
if (tox != null && tox.size() > 0) {
Node.setAttribute(n, "toxicity_all", tox.toString());
int max = Integer.MIN_VALUE, min = Integer.MAX_VALUE;
for (int i : tox) {
max = max < i ? i : max;
min = min > i ? i : min;
}
Node.setAttribute(n, "toxicity_min", min);
Node.setAttribute(n, "toxicity_max", max);
}
}
private void setXrefs(Long node, Chemical c) {
for (REFS typ : Chemical.REFS.values()) {
if (c.getRef(typ) != null) {
Double valuation = c.getRefMetric(typ);
Node.setAttribute(node, typ.name(), c.getRef(typ).toString());
Node.setAttribute(node, "metric" + typ.name(), valuation == null ? -999999999.0 : valuation);
Node.setAttribute(node, "log10metric" + typ.name(), valuation == null ? -99.0 : Math.log10(valuation));
Node.setAttribute(node, "has" + typ.name(), true);
} else {
Node.setAttribute(node, "has" + typ.name(), false);
}
}
}
}