/*************************************************************************
 * This file is part of the 20n/act project.
 * 20n/act enables DNA prediction for synthetic biology/bioengineering.
 * Copyright (C) 2017 20n Labs, Inc.
 *
 * Please direct all queries to act@20n.com.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *************************************************************************/

package com.act.reachables;

import act.server.MongoDB;
import act.shared.helpers.P;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class WavefrontExpansion {
  private static String _fileloc = "com.act.reachables.WavefrontExpansion";

  // these are computed once and are finalized thereafter
  Set<Long> cofactors_and_natives;
  Long root = -1L;
  Long rootProxyInLayer1 = -2L;

  // everything below changes as part of the reachables expansion
  Set<Long> R; // list of reachable chems
  HashMap<Integer, Set<Long>> R_by_layers;
  HashMap<Integer, Set<Long>> R_by_layers_in_host;
  HashMap<Long, Long> R_parent; // chosen parent at layer i-1 for a chem at layer i
  HashMap<Long, Set<Long>> R_owned_children; // final set of children owned by parent (key)
  HashMap<Long, Set<Long>> R_parent_candidates; // list of candidates in layer i-1 that could be parents for layer i chem
  HashMap<Long, List<Long>> rxn_needs; // list of precondition chemicals for each reaction
  Set<Long> roots; // under "CreateUnreachableTrees" we also compute conditionally reachable trees rooted at important assumed nodes
  int currentLayer;

  // when computing reachables, we log the sequences
  // that create new reachables
  Set<Long> seqThatCreatedNewReachable;
  // and the sequences that have enabled substrates,
  // irrespective of whether they create new reachables or not
  Set<Long> seqWithReachableSubstrates;

  // when doing the assumed_reachable world, the parents of a node deep in the tree get stolen,
  // and then we have to attach the node directly to the root of the tree. We log those in
  // this set to make sure that we can annotate those edges in the front-end
  Set<Long> isAncestorAndNotDirectParent;
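
  // R_assumed_reachable accumulates the result of the assumed-reachable (conditional) expansion;
  // R_saved and rxn_needs_saved are the snapshots taken by saveState() and restored by
  // restoreState() around each speculative expansion in addUnreachableTrees()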
  Set<Long> R_assumed_reachable;

  Set<Long> R_saved;
  HashMap<Long, List<Long>> rxn_needs_saved;

  WavefrontExpansion() {
    this.R = new HashSet<Long>();
    this.R_by_layers = new HashMap<Integer, Set<Long>>();
    this.R_by_layers_in_host = new HashMap<Integer, Set<Long>>();
    this.cofactors_and_natives = new HashSet<Long>();
    this.R_parent = new HashMap<Long, Long>();
    this.R_parent_candidates = new HashMap<Long, Set<Long>>();
    this.R_owned_children = new HashMap<Long, Set<Long>>();
    this.rxn_needs = computeRxnNeeds();
    this.currentLayer = 0;
    this.isAncestorAndNotDirectParent = new HashSet<Long>();
    this.seqWithReachableSubstrates = new HashSet<Long>();
    this.seqThatCreatedNewReachable = new HashSet<Long>();

    this.roots = new HashSet<Long>();
    roots.add(this.root);
  }

  private static void logProgress(String format, Object... args) {
    if (!GlobalParams.LOG_PROGRESS)
      return;

    System.err.format(_fileloc + ": " + format, args);
  }

  private static void logProgress(String msg) {
    if (!GlobalParams.LOG_PROGRESS)
      return;

    System.err.println(_fileloc + ": " + msg);
  }

  public static Integer countCarbons(String inchi) {
    String[] spl = inchi.split("/");

    // For lone atoms like carbon, the inchi is InChI=1S/C, which is valid. So any other inchi with fewer than 2
    // components is invalid.
    if (spl.length < 2)
      return null;

    String formula = spl[1];

    // The below regex pattern will match all atoms and their counts. ASSUMPTION: an atom symbol starts with
    // a capital letter, followed by lower case letters.
    Pattern patternToMatchAllAtomsAndTheirCounts = Pattern.compile("([A-Z][a-z]*)([0-9]*)");
    Matcher matcher = patternToMatchAllAtomsAndTheirCounts.matcher(formula);

    while (matcher.find()) {
      // We are guaranteed to have two groups based on the pattern, but the numeric group could be an empty string.
      if (matcher.group(1).equals("C")) {
        return matcher.group(2).equals("") ? 1 : Integer.parseInt(matcher.group(2));
      }
    }

    return 0;
  }

  public Tree<Long> expandAndPickParents() {
    ActData.instance().natives.forEach(this::addToReachablesAndCofactorNatives);
    ActData.instance().cofactors.forEach(this::addToReachablesAndCofactorNatives);

    System.out.format("Size of cofactors_and_natives: %d\n", this.cofactors_and_natives.size());

    logProgress("Starting computeTree");
    logProgress("Cofactors and natives = " + this.cofactors_and_natives);

    // add the natives and cofactors
    addToLayers(R, 0 /* this.currentLayer */, false /* addToExisting */, false /* isInsideHost */);
    this.currentLayer++;
    updateEnabled(R);
    setParentsForCofactorsAndNatives(cofactors_and_natives);

    Set<Long> doNotAssignParentsTo = new HashSet<Long>();
    // those with InChI:/FAKE/ are either big molecules (no parents), or R-group-containing chemicals.
    // Either way, do not complain if we cannot find parents for them.
    Set<Long> possibleBigMols = ActData.instance().metaCycBigMolsOrRgrp;

    if (GlobalParams.actTreeCreateHostCentricMap) {
      // add all host organism reachables
      int host_layer = 0;
      while (anyEnabledReactions(GlobalParams.gethostOrganismID())) {
        logProgress("Current layer (inside host expansion): " + this.currentLayer);
        boolean newAdded = pushWaveFront(GlobalParams.gethostOrganismID(), host_layer);
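        // Parents are assigned only when this host wavefront actually added new reachables; the
        // host_layer counter then advances, while this.currentLayer is left untouched until the
        // whole host expansion completes.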
        if (newAdded) {
          // temporary....
          pickParentsForNewReachables(this.currentLayer, host_layer++, doNotAssignParentsTo, possibleBigMols, null /* no assumptions */);
        }
      }
      this.currentLayer++; // now outside host
    }

    // compute layers
    while (anyEnabledReactions(null)) {
      logProgress("layer = " + this.currentLayer + "; num_reachables = " + this.R.size());
      boolean newAdded = pushWaveFront(null, this.currentLayer);
      pickParentsForNewReachables(this.currentLayer++, -1 /* outside host */, doNotAssignParentsTo, possibleBigMols, null /* no assumptions */);
    }

    if (GlobalParams.actTreeCreateUnreachableTrees) {
      List<EnvCond>[] workLists = worklistOfAssumedReachables();
      Set<Long> allReach = new HashSet<Long>();
      allReach.addAll(addUnreachableTrees(workLists[0], this.R)); // handle all singular assumed_reachables
      allReach.addAll(addUnreachableTrees(workLists[1], allReach)); // handle all tuples of assumed_reachables
      this.R_assumed_reachable = allReach;
    } else {
      this.R_assumed_reachable = new HashSet<Long>();
    }

    addNodesThatHaveUserSpecifiedFields();

    Set<Long> still_unreach = new HashSet<Long>(ActData.instance().chemsReferencedInRxns);
    still_unreach.removeAll(this.R);
    still_unreach.removeAll(this.R_assumed_reachable);

    logProgress("Reachables size: %s\n", this.R.size());
    logProgress("Assumed reachables size: %s\n", this.R_assumed_reachable.size());
    logProgress("Still unreachable size: %s\n", still_unreach.size());

    return new Tree<Long>(getRoots(), this.R_parent, this.R_owned_children, constructAttributes());
  }

  private void addNodesThatHaveUserSpecifiedFields() {
    Long artificialSubtreeID = -100L;

    for (String f : ActData.instance().chemicalsWithUserField.keySet()) {
      List<Long> molecules = ActData.instance().chemicalsWithUserField.get(f);
      Long artRootID = artificialSubtreeID--;

      // make this new predicate a child of the root
      this.R_owned_children.get(this.root).add(artRootID);
      this.R_parent.put(artRootID, this.root);

      // add all artificially reachable molecules to layer 2 and their parent to layer 1
      this.R_by_layers.get(1).add(artRootID);
      for (Long c : molecules) {
        // if the molecule is already reachable, we should
        // not add it to the artificial reachable clade
        if (this.R.contains(c)) {
          // log that it was already reached organically
          ActData.instance().chemicalsWithUserField_treeOrganic.add(c);
          continue;
        }

        // log that it could not be reached organically
        ActData.instance().chemicalsWithUserField_treeArtificial.add(c);

        // set it to be artificially reachable
        this.R.add(c);
        this.R_by_layers.get(2).add(c);

        // set its parent to the artificial predicate node
        this.R_parent.put(c, artRootID);
        if (!this.R_owned_children.containsKey(artRootID))
          this.R_owned_children.put(artRootID, new HashSet<Long>());
        this.R_owned_children.get(artRootID).add(c);
      }
    }
  }

  private void addToReachablesAndCofactorNatives(Long c) {
    this.R.add(c);
    this.cofactors_and_natives.add(c);
  }

  private List<EnvCond>[] worklistOfAssumedReachables() {
    // read all reactions in rxn_needs, check their "needs" and create a speculation tuple
    // out of those needs. Ensure that you keep a count of the number of times the tuple is
    // seen. Then output the sorted list.
    HashMap<EnvCond, Integer> counts = new HashMap<EnvCond, Integer>();
    for (Long r : this.rxn_needs.keySet()) {
      EnvCond tuple = new EnvCond(this.rxn_needs.get(r));
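      // tally how many still-unsatisfied reactions need exactly this set of precondition chemicals;
      // tuples that would unlock more reactions get priority when the worklist is sorted below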
      counts.put(tuple, 1 + (counts.containsKey(tuple) ? counts.get(tuple) : 0));
    }
    return sortByCountsAndPrioritizeSingulars(counts);
  }

  private List<EnvCond>[] sortByCountsAndPrioritizeSingulars(HashMap<EnvCond, Integer> counts) {
    List<P<EnvCond, Integer>> sc = new ArrayList<P<EnvCond, Integer>>();
    for (EnvCond se : counts.keySet())
      sc.add(new P<EnvCond, Integer>(se, counts.get(se)));
    Collections.sort(sc, new DescendingComparor<EnvCond>());

    List<EnvCond> singles = new ArrayList<EnvCond>();
    List<EnvCond> tuples = new ArrayList<EnvCond>();
    for (P<EnvCond, Integer> e : sc)
      if (e.fst().speculatedChems().size() > 1)
        tuples.add(e.fst());
      else
        singles.add(e.fst());

    List<EnvCond>[] a = new List[] { singles, tuples };
    return a;
  }

  private Set<Long> addUnreachableTrees(List<EnvCond> worklist, Set<Long> alreadyReached) {
    logProgress("Will process %d assumptions\n", worklist.size());

    /*
      // cache the reachables (R, R_by_layers, anything else?)
      // P {
      //   for each remaining unreachable X
      //     assume_reachable(X), compute new reachables and put the set of them as reachable_under(X)
      //     pop the reachables back to the cached values
      //   for each remaining unreachable X
      //     find X with the largest reachable_under(X)
      //     pickParents for X's tree
      //     set reachable for all new acquires
      // }
      // for each remaining unreachable_tuple Y, do P with X=Y
    */

    // save the current state by doing a deep copy
    // saves this.R and this.rxn_needs
    saveState();

    List<P<EnvCondEffect, Integer>> assumptionOutcomes = new ArrayList<P<EnvCondEffect, Integer>>();
    int count = 0;
    int total_sz = worklist.size(), sz;
    while ((sz = worklist.size()) > 0) {
      if ((10 * sz) / total_sz != (10 * (sz + 1) / total_sz))
        logProgress("Completed %d0 percent of unreachable computation\r", 10 - (10 * sz) / total_sz);

      if (GlobalParams._limitedPreconditionsConsidered != 0 &&
          count++ >= GlobalParams._limitedPreconditionsConsidered)
        break; // premature termination, dictated by the front-end.

      EnvCond envCond = worklist.remove(0);

      restoreState(); // pop to normal reachability: restore this.R, this.rxn_needs
      this.currentLayer++;
      int startingLayer = this.currentLayer;

      Set<Long> assumedReachable = envCond.speculatedChems();
      addToLayers(assumedReachable, this.currentLayer++, false /* addToExisting */, false /* isInsideHost */);
      this.R.addAll(assumedReachable);
      updateEnabled(assumedReachable);

      while (anyEnabledReactions(null)) {
        // reads: rxn_needs
        // updates: rxn_needs (removes preconditions of new reaches)
        // adds to: R, R_by_layer, R_parent_candidates
        boolean newAdded = pushWaveFront(null, this.currentLayer++);
      }

      // delta from saved_R, modulo the chems we assumed and added as is
      Set<Long> newReach = deepCopy(this.R);
      newReach.removeAll(this.R_saved);
      int newReachCount = newReach.size() - envCond.speculatedChems().size();

      EnvCondEffect effect = new EnvCondEffect(envCond, newReachCount, newReach, startingLayer, this.currentLayer);
      assumptionOutcomes.add(new P<EnvCondEffect, Integer>(effect, newReachCount));
    }

    // pop to normal reachability: restore this.R, this.rxn_needs
    restoreState();

    Set<Long> allReach = deepCopy(alreadyReached);

    // those with InChI:/FAKE/ are either big molecules (no parents), or
    // R-group-containing chemicals. Either way, do not complain if we
    // cannot find parents for them.
    Set<Long> possibleBigMols = ActData.instance().metaCycBigMolsOrRgrp;
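
    // Assumptions are processed in descending order of how many new chemicals they unlocked, so the
    // largest conditional trees claim their nodes (and parents) first; anything already claimed ends
    // up in allReach, which is passed below as doNotAssignParentsTo so smaller trees cannot re-parent it.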
    Collections.sort(assumptionOutcomes, new DescendingComparor<EnvCondEffect>());

    for (int idx = 0; idx < assumptionOutcomes.size(); idx++) {
      EnvCondEffect newTreeData = assumptionOutcomes.get(idx).fst();
      if (newTreeData.sizeNewReach <= GlobalParams.actTreeMinimumSizeOfConditionalTree)
        continue; // assumed nodes that do not enable even a single other node are irrelevant

      // for now we only handle the case of a single node precondition.
      // Ideally, for tuples, we will create a circle of the nodes that
      // make up the precondition, and then have the tree hang off it
      for (Long r : newTreeData.e.speculatedChems())
        if (!allReach.contains(r)) {
          this.roots.add(r); // the precondition is a root of a new disconnected tree
        }

      for (int layer = newTreeData.startingLayer + 1; layer < newTreeData.endingLayer; layer++) {
        // reads: R_by_layers[current, current-1], R_parent_candidates
        // adds to: R_parent, R_owned_children
        Set<Long> doNotAssignParentsTo = allReach; // because these are already in some part of the tree
        pickParentsForNewReachables(layer, -1 /* outside host */, doNotAssignParentsTo, possibleBigMols, newTreeData.e);
      }

      // everyone under this assumed subtree was either "parented" in this iteration,
      // or in a previous iteration, when a larger parent acquired it. In any case,
      // they are all now reachable.
      allReach.addAll(newTreeData.newReach);
    }

    return allReach;
  }

  private void saveState() {
    this.R_saved = deepCopy(this.R);
    this.rxn_needs_saved = deepCopy(this.rxn_needs);
  }

  private void restoreState() {
    this.R = deepCopy(this.R_saved);
    this.rxn_needs = deepCopy(this.rxn_needs_saved);
  }

  private <T, S> HashMap<T, List<S>> deepCopy(HashMap<T, List<S>> map) {
    HashMap<T, List<S>> copy = new HashMap<T, List<S>>();
    for (T r : map.keySet())
      copy.put(r, new ArrayList<S>(map.get(r)));
    return copy;
  }

  private <T> Set<T> deepCopy(Set<T> parentR) {
    return new HashSet<T>(parentR);
  }

  private Set<Long> getRoots() {
    return this.roots;
  }

  private HashMap<Long, Object> constructAttributes() {
    HashMap<Long, Object> attributes = new HashMap<Long, Object>();
    for (int layer : this.R_by_layers.keySet()) {
      for (Long n : this.R_by_layers.get(layer)) {
        HashMap<String, Integer> attr = new HashMap<String, Integer>();
        boolean isInsideHost = layer == 1;
        if (isInsideHost) {
          // host reachables
          // globalLayer->1, hostLayer=getHostLayerOf(n)
          attr.put("globalLayer", 1);
          attr.put("hostLayer", getHostLayerOf(n));
        } else {
          // globalLayer->layer, hostLayer=-1
          attr.put("globalLayer", layer);
          attr.put("hostLayer", -1);
        }
        attributes.put(n, attr);
      }
    }

    // for any of the disjoint assumed_reachable trees, create dummy layer values
    HashMap<String, Integer> condReachAttr = new HashMap<String, Integer>();
    condReachAttr.put("globalLayer", -2);
    condReachAttr.put("hostLayer", -2);
    for (Long condReach : this.R_assumed_reachable) {
      if (attributes.containsKey(condReach))
        continue;
      attributes.put(condReach, condReachAttr);
    }

    HashMap<String, Integer> rootAttr = new HashMap<String, Integer>();
    rootAttr.put("globalLayer", -1);
    rootAttr.put("hostLayer", -1);
    for (Long root : getRoots())
      attributes.put(root, rootAttr);
    // exception: rootProxy is not in the roots list because we want a 1-1 between roots and disjoint trees
    attributes.put(this.rootProxyInLayer1, rootAttr);

    HashMap<String, Integer> isAttachedToAncestor = new HashMap<String, Integer>();
    isAttachedToAncestor.put("attachedDirectlyToRoot", 1);
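    // nodes that had to be attached straight to an assumed root (rather than to a true parent in the
    // layer above) get only this marker attribute; putting it here overwrites any layer attributes
    // assigned above, and the front-end can use it to flag such edges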
    for (Long orphan : this.isAncestorAndNotDirectParent) {
      attributes.put(orphan, isAttachedToAncestor);
    }

    return attributes;
  }

  private HashMap<Long, List<Long>> computeRxnNeeds() {
    // use the following as the universe of reactions to enumerate over
    HashMap<Long, Set<Long>> substrates_dataset = GlobalParams.USE_RXN_CLASSES ?
        ActData.instance().rxnClassesSubstrates :
        ActData.instance().rxnSubstrates;

    HashMap<Long, List<Long>> needs = new HashMap<>();
    int ignored_noseq = 0, total = 0;
    logProgress("Processing all rxns for rxn_needs: %d\n", substrates_dataset.size());
    for (Long r : substrates_dataset.keySet()) {
      Set<Long> substrates = substrates_dataset.get(r);

      // do not add reactions that don't have a sequence; unless the flag to be liberal is set
      if (GlobalParams._actTreeOnlyIncludeRxnsWithSequences) {
        if (!ActData.instance().rxnHasSeq.get(r)) {
          ignored_noseq++;
          continue;
        }
      }
      total++;

      needs.put(r, new ArrayList<>(substrates));
    }
    if (GlobalParams._actTreeOnlyIncludeRxnsWithSequences)
      logProgress("Ignored %d reactions that had no sequence. Total were %d\n", ignored_noseq, total);

    return needs;
  }

  protected Set<Long> productsOf(Set<Long> enabledRxns) {
    // use the following as the universe of reactions to enumerate over
    HashMap<Long, Set<Long>> substrates_dataset = GlobalParams.USE_RXN_CLASSES ?
        ActData.instance().rxnClassesSubstrates :
        ActData.instance().rxnSubstrates;
    HashMap<Long, Set<Long>> products_dataset = GlobalParams.USE_RXN_CLASSES ?
        ActData.instance().rxnClassesProducts :
        ActData.instance().rxnProducts;

    Set<Long> P = new HashSet<Long>();
    for (Long r : enabledRxns) {
      Set<Long> products_raw = products_dataset.get(r);
      Set<Long> products_made = productsThatAreNotAbstract(products_raw);

      P.addAll(products_made);

      // for each product, the substrates of these reactions
      // are potential candidate parents for it, so we add them
      // to the list of candidates that we will process later.
      for (Long p : products_made) {
        // never assign parents to cofactors or natives
        if (cofactors_and_natives.contains(p))
          continue;

        // Add the substrates of the reactions as
        // the options for parents for the products;
        // the elements in P might not be new reachables,
        // but that is ok, since we will only assign a parent in layer i-1
        Set<Long> parent_candidates = new HashSet<Long>();
        parent_candidates.addAll(substrates_dataset.get(r));

        // -- without adding cofactors, there are reactions
        // in which the products will have no option of parents,
        // so we have to allow cofactors in the parent candidates;
        // but at the same time, some are really bad parents,
        // e.g., water and ATP, so at the end we blacklist them as owning parents
        if (!this.R_parent_candidates.containsKey(p))
          this.R_parent_candidates.put(p, new HashSet<>());
        this.R_parent_candidates.get(p).addAll(parent_candidates);
      }
    }
    return P;
  }

  private Set<Long> productsThatAreNotAbstract(Set<Long> ps) {
    Set<Long> nonAbstract = new HashSet<Long>();
    for (Long p : ps) {
      Boolean isAbstract = ActData.instance().chemIdIsAbstraction.get(p);
      if (isAbstract != null && !isAbstract) {
        nonAbstract.add(p);
      }
    }
    return nonAbstract;
  }

  private Long pickMostSimilar(Long p, Set<Long> ss) {
    String prod = ActData.instance().chemId2Inchis.get(p);
    Integer numCprod, numCsubstrate;
    if (prod == null || (numCprod = countCarbons(prod)) == null)
      return null;

    int closest = 10000000; // these many carbons away
    Long closestID = null;
    for (Long s : ss) {
      String substrate = ActData.instance().chemId2Inchis.get(s);
      if (substrate == null || (numCsubstrate = countCarbons(substrate)) == null)
        continue;
      int delta = Math.abs(numCsubstrate - numCprod);
      if (closest > delta) {
        closest = delta;
        closestID = s;
      }
    }
    return closestID;
  }

  /*
   * checks "rxn_needs" for the enabled reactions
   * picks up the enabled products using the enabled reactions
   * adds whichever ones are new reachables to "R"
   *
   * marks the products with possible parents as the substrates
   * of the enabled reactions into "R_parent_candidates"
   *
   * adds to layer either this.R_by_layers_in_host (orgID!=null)
   * or this.R_by_layers (orgID==null)
   *
   * updates the "rxn_needs" by removing the preconditions in
   * case new reactions are enabled with these new reachables
   *
   * so: reads and updates this.rxn_needs, adds to this.R,
   * adds to this.R_parent_candidates,
   * adds to this.R_by_layers (or _in_host if orgID!=null)
   */
  private boolean pushWaveFront(Long orgID, int layer) {
    boolean isInsideHost = orgID != null;
    Set<Long> enabledRxns = extractEnabledRxns(orgID);

    if (isInsideHost)
      logProgress("Org: %d, num enabled rxns: %d\n", orgID, enabledRxns.size());

    Set<Long> newReachables = productsOf(enabledRxns);

    Set<Long> uniqNew = new HashSet<Long>(newReachables);
    uniqNew.removeAll(R);
    if (uniqNew.size() > 0) {
      addToLayers(uniqNew, layer, true /* add to existing layer */, isInsideHost);
    }

    R.addAll(newReachables);
    updateEnabled(newReachables);

    return uniqNew.size() > 0; // at least one new node in this layer
  }

  protected boolean anyEnabledReactions(Long orgID) {
    for (Long r : this.rxn_needs.keySet()) {
      if (orgID == null || ActData.instance().rxnOrganisms.get(r).contains(orgID))
        if (this.rxn_needs.get(r).isEmpty())
          return true;
    }
    return false;
  }

  protected Set<Long> extractEnabledRxns(Long orgID) {
    Set<Long> enabled = new HashSet<Long>();
    for (Long r : this.rxn_needs.keySet()) {
      if (this.rxn_needs.get(r).isEmpty()) {
        // if no orgID specified: add all rxns from any organism,
        // if orgID is specified: only if the reaction happens in the org
        if (orgID == null || ActData.instance().rxnOrganisms.get(r).contains(orgID))
          enabled.add(r);
      }
    }
    for (Long r : enabled)
      this.rxn_needs.remove(r);

    return enabled;
  }

  protected void updateEnabled(Set<Long> newReachables) {
    for (Long r : this.rxn_needs.keySet()) {
      List<Long> needs = new ArrayList<Long>();
      for (Long l : this.rxn_needs.get(r)) {
        if (!newReachables.contains(l))
          needs.add(l);
      }
      this.rxn_needs.put(r, needs);
    }
  }

  private void setParentsForCofactorsAndNatives(Set<Long> R) {
    // the natives and cofactors have the root as their parent
    for (Long child : R) {
      this.R_parent.put(child, this.root);
    }
    this.R_parent.put(this.rootProxyInLayer1, this.root);

    Set<Long> childrenOfTheRoot = new HashSet<Long>(R);
    childrenOfTheRoot.add(this.rootProxyInLayer1);
    this.R_owned_children.put(this.root, childrenOfTheRoot);
  }

  /*
   * reads R_by_layers (or _in_host) for the current layer and layer-1;
   * for every node in the current layer, finds
   * candidate parents from "R_parent_candidates" minus cofactors,
   * and the one that is most similar structurally (if none, then all
   * parents are candidates)
   *
   * Then goes through the possible parents and greedily picks ones
   * that can acquire the most children; installs them in R_parent, R_owned_children
   *
   * so: reads from R_by_layers[current, current-1], and R_parent_candidates
   * and writes to R_parent, R_owned_children
   */
  private void pickParentsForNewReachables(int layer, int host_layer, Set<Long> doNotChangeNeighborhoodOf,
                                           Set<Long> possibleBigMolecules, EnvCond treeRoot) {
    Set<Long> reachInNewLayer;
    Set<Long> reachInLayerAbove;

    // logProgress("picking parent (layer,host_layer) = (%d,%d)\n", layer, host_layer);

    if (host_layer == -1) {
      // this is a non-host layer update
      reachInNewLayer = this.R_by_layers.get(layer);
      reachInLayerAbove = this.R_by_layers.get(layer - 1);
    } else {
      // this is a layer update within the host
      reachInNewLayer = this.R_by_layers_in_host.get(host_layer);
      if (host_layer == 0)
        // the layer above is not in the host, it is layer=0
        reachInLayerAbove = this.R_by_layers.get(0 /* natives and cofactors layer */);
      else
        reachInLayerAbove = this.R_by_layers_in_host.get(host_layer - 1);
    }

    // for each of the keys that could be a parent (in the tree), its set of possible children
    HashMap<Long, Set<Long>> possible_children = new HashMap<Long, Set<Long>>();

    // init
    for (Long p : reachInLayerAbove)
      possible_children.put(p, new HashSet<Long>());

    if (reachInNewLayer == null)
      return;

    // for each child in "layer", lookup its candidate parents and add the child to the parent's possible ownership
    for (Long child : reachInNewLayer) {
      boolean at_least_one_parent = false;

      Set<Long> candidates = this.R_parent_candidates.get(child);
      Long most_similar = pickMostSimilar(child, candidates);

      if (most_similar != null && possible_children.containsKey(most_similar)) {
        // ideal case: the most_similar substrate is computable, and the possible_children containment
        // indicates that the most_similar is also in the layer directly below, and therefore the best parent possible
        possible_children.get(most_similar).add(child);
        at_least_one_parent = true;
      } else {
        // the ideal case does not pan out: either the most_similar is uncomputed,
        // or the most similar is in a layer much below and the immediate precursor dependency was
        // some other substrate needed for the reaction. Ah well.
        // Let's just attach this child to this other dependency
        for (Long parent_candidate : candidates) {
          // if the possible_children map does not contain a parent_candidate key, it is not in the
          // immediate layer below and cannot be considered a direct parent, so skip it
          if (!possible_children.containsKey(parent_candidate)) {
            continue;
          }
          possible_children.get(parent_candidate).add(child);
          at_least_one_parent = true;
        }
      }

      if (!at_least_one_parent && layer == 2) {
        // there might be no parents to claim a child in layer 2 because it might have parents that are cofactors or natives,
        // which means that it would have been in layer 1, except for the restriction that layer 1 is only ecoli enzymes;
        // so we will attach it to the proxy node for the natives and cofactors, which will show up at the same level as layer 1 nodes
        if (!possible_children.containsKey(this.rootProxyInLayer1))
          possible_children.put(this.rootProxyInLayer1, new HashSet<Long>());
        possible_children.get(this.rootProxyInLayer1).add(child);
      }
    }

    // look at "layer - 1" and greedily assign its nodes as parents to as many "layer" nodes as possible
    Set<Long> still_orphan = new HashSet<>();
    for (Long id : reachInNewLayer) {
      // the nodes in "doNotChangeNeighborhoodOf" do not need to find parents; they were already assigned elsewhere.
      // metacyc gives us some molecules with db.chemicals.findOne({InChI:/FAKE/}). These are either
      // big molecules (proteins, rna, dna, etc.), or big-molecule-attached small molecules, or small molecule abstractions;
      // either way, do not worry about assigning parents to them.
      if (!doNotChangeNeighborhoodOf.contains(id) && !possibleBigMolecules.contains(id)) {
        still_orphan.add(id);
      }
    }

    // greedily assign children to parents
    while (still_orphan.size() > 0) {
      // there is still a child left, find the parent that can adopt the largest number of children;
      // cannot pick a parent that is in "doNotChangeNeighborhoodOf", so pass that along to the function below
      Long largest_parent = pick_with_largest_candidate_children(possible_children, doNotChangeNeighborhoodOf);
      Set<Long> adopted_children = possible_children.get(largest_parent); // these children have found a parent

      if (adopted_children == null) {
        if (treeRoot == null) {
          // this is the case where we are computing the true reachables and not the assumed_reachables,
          // i.e., not under an artificial world with extra assumptions. Therefore, there should be no
          // orphans here. They should all be accounted for.
          throw new RuntimeException("Nodes that will remain orphan: " + still_orphan);
        } else {
          // this is the other case where we have a "different world assumption", i.e., there are assumed
          // tree roots and their corresponding descendants. Now here a problem arises when greedily assigning
          // parents *across worlds* (i.e., across different assumptions). Here a larger tree might have stolen
          // some of the parents on the route from these nodes to the treeRoot. In that case, this orphan has
          // nowhere to go: a) it cannot be part of this "world" because the parents on the route to the treeRoot
          // are missing in this world, and b) it cannot be part of the world where its parent was adopted, because
          // the reachability of this node depends on there being reachables other than its parent, which are
          // only present in this world and not in the alternative where its parent went.
          // SO: we have to make an exception, and attach this node directly to the treeRoot; BUT annotate
          // that this is not a direct descendant of the root. Then we can highlight those edges in the front-end.
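          // Each orphan is attached to the most structurally similar chemical among the assumed root's
          // chemicals (by carbon-count distance via pickMostSimilar), and recorded in
          // isAncestorAndNotDirectParent so these ancestor-but-not-direct-parent edges can be annotated.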
          for (Long n : still_orphan) {
            // first, if there are multiple elements in the root assumptions, then pick the most similar
            Set<Long> candidates = deepCopy(treeRoot.speculatedChems());
            Long most_similar = pickMostSimilar(n, candidates);
            if (most_similar == null) {
              // we do not have structure information (either for n, or for any of the candidates)
              // so just pick the first guy in the list of candidates to be the parent
              // (candidates has the invariant of being at least size >= 1)
              most_similar = candidates.toArray(new Long[] {})[0];
            }

            // define most_similar as n's parent
            addChildToParentsChildren(most_similar /* parent */, n /* child */);
            this.isAncestorAndNotDirectParent.add(n);
          }
          still_orphan.clear();
          break; // all orphans are now attached to the root; exit the loop.
        }
      }

      possible_children.remove(largest_parent); // this parent should not be considered in the future

      adopted_children.removeAll(doNotChangeNeighborhoodOf);
      installParentWithChildren(largest_parent, adopted_children);

      for (Long child : adopted_children) {
        // child has a parent now, so remove it from the still_orphan list
        still_orphan.remove(child);

        // clean up the possible_children lists for other parents
        for (Long other_parent : possible_children.keySet()) {
          if (possible_children.get(other_parent).contains(child))
            possible_children.get(other_parent).remove(child);
        }
      }
    }
  }

  private <T> Set<T> intersect(Set<T> A, Set<T> B) {
    Set<T> intersection = new HashSet<T>();
    if (A == null || B == null)
      return intersection;
    for (T a : A)
      if (B.contains(a))
        intersection.add(a);
    return intersection;
  }

  private void installParentWithChildren(Long parent, Set<Long> children) {
    // set the parent's set of children
    this.R_owned_children.put(parent, children);

    for (Long child : children) {
      // set the child's parent
      this.R_parent.put(child, parent);
    }
  }

  private void addChildToParentsChildren(Long parent, Long child) {
    // add to the parent's set of children (as opposed to setting it, as in the function above)
    if (this.R_owned_children.get(parent) == null)
      this.R_owned_children.put(parent, new HashSet<Long>());
    this.R_owned_children.get(parent).add(child);

    // set the child's parent
    this.R_parent.put(child, parent);
  }

  private void removeBlackListedCofactorsAsParents(Set<Long> candidates, Long child) {
    for (MongoDB.SomeCofactorNames cofactor : MongoDB.SomeCofactorNames.values()) {
      Long cofactorDBId = cofactor.getMongoDBId();
      // we check size > 1 because we do not want to leave the child without any parents to choose from.
      if (candidates.contains(cofactorDBId))
        if (candidates.size() > 1)
          candidates.remove(cofactorDBId);
        else
          logProgress("**** RemovingBlackListCofactors: %s is a candidate parent, and the ONLY one for %d.\n",
              cofactor, child);
    }
  }

  private Long pick_with_largest_candidate_children(HashMap<Long, Set<Long>> map, Set<Long> blackList) {
    int max = -1;
    Long maxId = -1L;
    for (Long k : map.keySet()) {
      if (blackList.contains(k))
        continue;
      if (max < map.get(k).size()) {
        max = map.get(k).size();
        maxId = k;
      }
    }
    return maxId;
  }

  private int getLayerOf(Long c) {
    for (int l : this.R_by_layers.keySet())
      if (this.R_by_layers.get(l).contains(c))
        return l;
    return -1;
  }

  private int getHostLayerOf(Long c) {
    for (int l : this.R_by_layers_in_host.keySet())
      if (this.R_by_layers_in_host.get(l).contains(c))
        return l;
    if (this.R_by_layers.get(0).contains(c))
      return -2;
    return -1;
  }

  private void addToLayers(Set<Long> nodes, int layer, boolean addToExisting, boolean isInsideHost) {
    Set<Long> addNodes = new HashSet<Long>(nodes);
    HashMap<Integer, Set<Long>> map = isInsideHost ? this.R_by_layers_in_host : this.R_by_layers;

    // sanity check....
    if (map.containsKey(layer) && !addToExisting) {
      throw new RuntimeException("ERR: Layer already installed and addToExisting not requested!?");
    }

    // really add to the particular map: if isInsideHost it will add to the host map, else to the global one
    if (map.containsKey(layer))
      addNodes.addAll(map.get(layer));
    map.put(layer, addNodes);

    // even if we add to the host map, the global one has the aggregated set of host reachables in layer 1
    if (isInsideHost) {
      Set<Long> aggregatedLayer1Nodes = new HashSet<Long>(nodes);
      if (this.R_by_layers.containsKey(1))
        aggregatedLayer1Nodes.addAll(this.R_by_layers.get(1));
      this.R_by_layers.put(1, aggregatedLayer1Nodes);
    }
  }

  public class DescendingComparor<T> implements Comparator<P<T, Integer>> {
    @Override
    public int compare(P<T, Integer> o1, P<T, Integer> o2) {
      return o2.snd().compareTo(o1.snd()); // compare o2 against o1 to invert the ordering and sort in descending order
    }
  }

  class EnvCondEffect {
    EnvCond e;
    int sizeNewReach;
    int startingLayer, endingLayer;
    Set<Long> newReach;

    EnvCondEffect(EnvCond e, int sz, Set<Long> newReach, int startL, int endL) {
      this.e = e;
      this.sizeNewReach = sz;
      this.newReach = newReach;
      this.startingLayer = startL;
      this.endingLayer = endL;
    }
  }
}