/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program. If not, see <http://www.gnu.org/licenses/>.  *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.cofactorremoval;

import act.server.NoSQLAPI;
import act.shared.Chemical;
import act.shared.Reaction;
import com.act.biointerpretation.BiointerpretationProcessor;
import com.act.biointerpretation.Utils.ReactionComponent;
import com.act.biointerpretation.mechanisminspection.BlacklistedInchisCorpus;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import static com.act.biointerpretation.Utils.ReactionComponent.PRODUCT;
import static com.act.biointerpretation.Utils.ReactionComponent.SUBSTRATE;

/**
 * This class reads in reactions from a read DB and processes each one so that cofactors are binned into
 * separate substrate/product cofactor lists.  It removes both concrete cofactors (i.e., ones with precise InChIs)
 * as well as abstract ones (i.e., FAKE InChIs).  It sequentially removes cofactors in a prioritized manner until
 * only one substrate and one product remain.
 *
 * Uniqueness in this database is determined by the combination of:
 * 1. the remaining substrate
 * 2. the remaining product
 * 3. the names of the substrate cofactors
 * 4. the names of the product cofactors
 *
 * Created by jca20n on 2/15/16.
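 *
 * <p>Illustrative end-to-end usage sketch (not taken from this file: the DB names are hypothetical, and it
 * assumes the {@code NoSQLAPI(readDB, writeDB)} constructor plus the {@code run()} driver inherited from
 * {@link BiointerpretationProcessor}):
 * <pre>{@code
 *   NoSQLAPI api = new NoSQLAPI("source_db", "dest_db"); // hypothetical read/write DB names
 *   CofactorRemover remover = new CofactorRemover(api);
 *   remover.init(); // loads the cofactor and blacklisted-InChI corpuses
 *   remover.run();  // migrates chemicals, then partitions cofactors out of each reaction
 * }</pre>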
 */
public class CofactorRemover extends BiointerpretationProcessor {
  private static final Logger LOGGER = LogManager.getFormatterLogger(CofactorRemover.class);
  private static final String PROCESSOR_NAME = "Cofactor Remover";
  private static final String FAKE = "FAKE";

  private FakeCofactorFinder fakeFinder;
  private CofactorsCorpus cofactorsCorpus;
  private Set<Long> knownCofactorReadDBIds = new HashSet<>();
  private Set<Long> knownCofactorWriteDBIds = null;
  private BlacklistedInchisCorpus blacklistedInchisCorpus;

  @Override
  public String getName() {
    return PROCESSOR_NAME;
  }

  public CofactorRemover(NoSQLAPI api) {
    super(api);
    fakeFinder = new FakeCofactorFinder();
  }

  public void init() throws IOException {
    cofactorsCorpus = new CofactorsCorpus();
    cofactorsCorpus.loadCorpus();

    blacklistedInchisCorpus = new BlacklistedInchisCorpus();
    blacklistedInchisCorpus.loadCorpus();

    markInitialized();
  }

  @Override
  protected Chemical runSpecializedChemicalProcessing(Chemical chem) {
    return assignCofactorStatus(chem);
  }

  private Chemical assignCofactorStatus(Chemical chemical) {
    Long oldId = chemical.getUuid();

    // First, check if the InChI needs to be updated.  A few cofactors are known to have broken InChIs.
    String inchi = blacklistedInchisCorpus.renameInchiIfFoundInBlacklist(chemical.getInChI());
    chemical.setInchi(inchi);

    boolean isCofactor = false;
    if (cofactorsCorpus.getInchiToName().containsKey(inchi)) {
      isCofactor = true;
    } else if (inchi.contains(FAKE) && (fakeFinder.scanAndReturnCofactorNameIfItExists(chemical) != null)) {
      // TODO: Abstract the FAKE InChI checks into their own utility class.
      isCofactor = true;
    }

    // Set isCofactor *without* looking at previous determinations.  This is the single source of truth for cofactors.
    chemical.setIsCofactor(isCofactor);
    if (isCofactor) {
      knownCofactorReadDBIds.add(oldId);
    }

    return chemical;
  }

  @Override
  protected void afterProcessChemicals() {
    LOGGER.info("Found %d cofactors amongst %d migrated chemicals",
        knownCofactorReadDBIds.size(), getOldChemIdToNewChemId().size());
    LOGGER.info("Building cofactor status map for new chemical ids to facilitate cofactor removal");
    knownCofactorWriteDBIds = new HashSet<>(knownCofactorReadDBIds.size());
    for (Long oldId : knownCofactorReadDBIds) {
      knownCofactorWriteDBIds.add(mapOldChemIdToNewId(oldId));
    }

    if (knownCofactorWriteDBIds.size() != knownCofactorReadDBIds.size()) {
      String msg = String.format("Old and new cofactor id sets do not match in size: %d vs. %d",
          knownCofactorReadDBIds.size(), knownCofactorWriteDBIds.size());
      if (knownCofactorWriteDBIds.size() > knownCofactorReadDBIds.size()) {
        LOGGER.error(msg);
        throw new RuntimeException(msg);
      } else {
        LOGGER.warn("%s (might be the result of blacklisted InChI correction, " +
            "which can reduce the number of cofactors in the new DB)", msg);
      }
    }

    LOGGER.info("New cofactor id map constructed, ready to process reactions.");

    /* TODO: we want to prevent any further access to the old map of ids to avoid accidental use instead of
     * knownCofactorWriteDBIds.  Is there a better way than this? */
    knownCofactorReadDBIds = null;
  }

  @Override
  protected Reaction preProcessReaction(Reaction rxn) {
    findAndIsolateCoenzymesFromReaction(rxn);
    // Make sure there are enough co/products and co/substrates in the processed reaction.
    if ((rxn.getSubstrates().length == 0 && rxn.getSubstrateCofactors().length == 0) ||
        (rxn.getProducts().length == 0 && rxn.getProductCofactors().length == 0)) {
      LOGGER.warn("Reaction %d does not have any products or substrates after coenzyme removal.", rxn.getUUID());
      return null;
    }
    return rxn;
  }

  /**
   * Removes chemicals that appear on both sides of the reaction (coenzymes) from the substrates and products, and
   * de-duplicates the entries within each category.
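   *
   * <p>Worked example with illustrative chemical ids (not drawn from any real DB):
   * <pre>{@code
   *   // before: substrates = [1, 2, 3], products = [3, 4], coenzymes = []
   *   // after:  substrates = [1, 2],    products = [4],    coenzymes = [3]
   * }</pre>
   *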
   * @param reaction The reaction being updated.
   */
  private void findAndIsolateCoenzymesFromReaction(Reaction reaction) {
    // Build ordered sets of the substrates/products.
    LinkedHashSet<Long> substrates = new LinkedHashSet<>(Arrays.asList(reaction.getSubstrates()));
    LinkedHashSet<Long> products = new LinkedHashSet<>(Arrays.asList(reaction.getProducts()));

    // Compute the intersection between the sets.
    Set<Long> intersection = new HashSet<>(substrates);
    intersection.retainAll(products);

    // A - intersection(A, B) = A \ B
    substrates.removeAll(intersection);
    products.removeAll(intersection);

    // Update the reaction with the new (ordered) substrates/products + coenzymes.
    reaction.setSubstrates(substrates.toArray(new Long[substrates.size()]));
    reaction.setProducts(products.toArray(new Long[products.size()]));

    // Keep any existing coenzymes, but don't use them when computing the difference--they might be there for a reason.
    intersection.addAll(Arrays.asList(reaction.getCoenzymes()));
    reaction.setCoenzymes(intersection.toArray(new Long[intersection.size()]));
  }

  @Override
  protected Reaction runSpecializedReactionProcessing(Reaction rxn, Long newId) {
    // Bump the cofactors up to the cofactor lists and update all substrates/products and their coefficients accordingly.
    updateReactionProductOrSubstrate(rxn, SUBSTRATE);
    updateReactionProductOrSubstrate(rxn, PRODUCT);
    return rxn;
  }

  /**
   * This function is the meat of the cofactor removal process.  It extracts all cofactors based on their ids and
   * places them in the appropriate collection within the reaction.  Note that because this is executed by
   * BiointerpretationProcessor's `runSpecializedReactionProcessing` hook, the chemical ids have already been updated
   * to reference the chemical entries in the WriteDB.
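   *
   * <p>Illustrative example, assuming (hypothetically) that 11 is the only one of these write-DB ids that was
   * flagged as a cofactor during chemical processing:
   * <pre>{@code
   *   // before: substrates = [10, 11, 12], substrateCofactors = []
   *   // after:  substrates = [10, 12],     substrateCofactors = [11]
   * }</pre>
   *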
   * @param reaction The reaction to update.
   * @param component Update substrates or products.
   */
  private void updateReactionProductOrSubstrate(Reaction reaction, ReactionComponent component) {
    Long[] chemIds, originalCofactorIds;
    if (component == SUBSTRATE) {
      chemIds = reaction.getSubstrates();
      originalCofactorIds = reaction.getSubstrateCofactors();
    } else {
      chemIds = reaction.getProducts();
      originalCofactorIds = reaction.getProductCofactors();
    }

    Map<Boolean, List<Long>> partitionedIds =
        Arrays.stream(chemIds).collect(Collectors.partitioningBy(knownCofactorWriteDBIds::contains));

    List<Long> cofactorIds = partitionedIds.containsKey(true) ? partitionedIds.get(true) : Collections.emptyList();
    List<Long> nonCofactorIds = partitionedIds.containsKey(false) ? partitionedIds.get(false) : Collections.emptyList();

    // Retain previously partitioned cofactors if any exist.
    if (originalCofactorIds != null && originalCofactorIds.length > 0) {
      // Use an ordered set to de-duplicate the partitioned and previously specified cofactors.  Original cofactors go first.
      LinkedHashSet<Long> uniqueCofactorIds = new LinkedHashSet<>(Arrays.asList(originalCofactorIds));
      uniqueCofactorIds.addAll(cofactorIds);
      /* We do this potentially expensive de-duplication step only in the presumably rare case that we find a reaction
       * that already has cofactors set.  A reaction that has not already undergone cofactor removal is very unlikely
       * to have cofactors partitioned from substrates/products. */
      cofactorIds = new ArrayList<>(uniqueCofactorIds);
    }

    // Coefficients for cofactors should automatically fall out when we update the substrate/product list.
    if (component == SUBSTRATE) {
      reaction.setSubstrateCofactors(cofactorIds.toArray(new Long[cofactorIds.size()]));
      reaction.setSubstrates(nonCofactorIds.toArray(new Long[nonCofactorIds.size()]));
      /* Coefficients should already have been set when the reaction was migrated to the new DB, so no need to update.
       * Note that this assumption depends strongly on the current coefficient implementation in the Reaction model. */
    } else {
      reaction.setProductCofactors(cofactorIds.toArray(new Long[cofactorIds.size()]));
      reaction.setProducts(nonCofactorIds.toArray(new Long[nonCofactorIds.size()]));
    }
  }

  /**
   * Removes cofactors from a single reaction by its ID.
   *
   * Important: do not call this on an object that has been/will be used to process an entire DB (via the `run` method,
   * for example).  The two approaches to cofactor removal use the same cache objects, which will be corrupted if the
   * object is reused (hence this method being protected).
   *
   * @param rxnId The id of the reaction to process.
   * @return The original and modified reaction objects.
   * @throws IOException
   */
  protected Pair<Reaction, Reaction> removeCofactorsFromOneReaction(Long rxnId) throws IOException {
    Reaction oldRxn = getNoSQLAPI().readReactionFromInKnowledgeGraph(rxnId);
    if (oldRxn == null) {
      LOGGER.error("Could not find reaction %d in the DB", rxnId);
      return null;
    }

    Set<Long> allChemicalIds = new HashSet<>();
    allChemicalIds.addAll(Arrays.asList(oldRxn.getSubstrates()));
    allChemicalIds.addAll(Arrays.asList(oldRxn.getProducts()));
    allChemicalIds.addAll(Arrays.asList(oldRxn.getSubstrateCofactors()));
    allChemicalIds.addAll(Arrays.asList(oldRxn.getProductCofactors()));
    allChemicalIds.addAll(Arrays.asList(oldRxn.getCoenzymes()));

    for (Long id : allChemicalIds) {
      Chemical chem = getNoSQLAPI().readChemicalFromInKnowledgeGraph(id);
      if (chem == null) {
        LOGGER.error("Unable to find chemical %d for reaction %d in the DB", id, rxnId);
        return null;
      }
      // Simulate chemical migration so we play nicely with the cofactor remover.
      getOldChemIdToNewChemId().put(id, id);
      getNewChemIdToInchi().put(id, chem.getInChI());
      chem = assignCofactorStatus(chem);
      if (chem.isCofactor()) {
        LOGGER.info("Found participating cofactor %d: %s", chem.getUuid(), chem.getInChI());
      }
    }

    Reaction newRxn = new Reaction(
        -1,
        oldRxn.getSubstrates(),
        oldRxn.getProducts(),
        oldRxn.getSubstrateCofactors(),
        oldRxn.getProductCofactors(),
        oldRxn.getCoenzymes(),
        oldRxn.getECNum(),
        oldRxn.getConversionDirection(),
        oldRxn.getPathwayStepDirection(),
        oldRxn.getReactionName(),
        oldRxn.getRxnDetailType()
    );

    findAndIsolateCoenzymesFromReaction(newRxn);
    newRxn = runSpecializedReactionProcessing(newRxn, -1L);

    return Pair.of(oldRxn, newRxn);
  }
}
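
/* Illustrative single-reaction sketch (hypothetical DB names and reaction id).  Note that
 * removeCofactorsFromOneReaction is protected, so a call like this would live in this package or in a subclass,
 * and the CofactorRemover instance must not also be used to run() over an entire DB:
 *
 *   NoSQLAPI api = new NoSQLAPI("source_db", "dest_db");
 *   CofactorRemover remover = new CofactorRemover(api);
 *   remover.init();
 *   Pair<Reaction, Reaction> beforeAndAfter = remover.removeCofactorsFromOneReaction(12345L);
 */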